@@ -519,7 +519,7 @@ LLVM_Util::LLVM_Util(const PerThreadInfo& per_thread_info, int debuglevel,
519
519
// TODO: why are there casts to the base class llvm::Type *?
520
520
m_vector_width = OIIO::floor2 (OIIO::clamp (m_vector_width, 4 , 16 ));
521
521
m_llvm_type_wide_float = llvm_vector_type (m_llvm_type_float,
522
- m_vector_width);
522
+ m_vector_width);
523
523
m_llvm_type_wide_double = llvm_vector_type (m_llvm_type_double,
524
524
m_vector_width);
525
525
m_llvm_type_wide_int = llvm_vector_type (m_llvm_type_int, m_vector_width);
@@ -790,8 +790,8 @@ LLVM_Util::debug_push_inlined_function(OIIO::ustring function_name,
790
790
method_scope_line, // Scope Line,
791
791
fnFlags,
792
792
llvm::DISubprogram::toSPFlags (true /* isLocalToUnit*/ ,
793
- true /* isDefinition*/ ,
794
- true /* false*/ /* isOptimized*/ ));
793
+ true /* isDefinition*/ ,
794
+ true /* false*/ /* isOptimized*/ ));
795
795
796
796
mLexicalBlocks .push_back (function);
797
797
}
@@ -3698,12 +3698,21 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
3698
3698
// Convert <4 x i1> -> <4 x i32>
3699
3699
llvm::Value* w4_int_mask = builder ().CreateSExt (mask,
3700
3700
type_wide_int ());
3701
+
3702
+ // Now we will use the horizontal sign extraction intrinsic
3703
+ // to build a 32 bit mask value. However the only 128bit
3704
+ // version works on floats, so we will cast from int32 to
3705
+ // float beforehand
3706
+ llvm::Type* w4_float_type = llvm_vector_type (m_llvm_type_float, 4 );
3707
+ llvm::Value* w4_float_mask = builder ().CreateBitCast (w4_int_mask,
3708
+ w4_float_type);
3709
+
3701
3710
// Now we will use the horizontal sign extraction intrinsic
3702
3711
// to build a 32 bit mask value.
3703
3712
llvm::Function* func = llvm::Intrinsic::getDeclaration (
3704
- module (), llvm::Intrinsic::x86_sse2_pmovmskb_128 );
3713
+ module (), llvm::Intrinsic::x86_sse_movmsk_ps );
3705
3714
3706
- llvm::Value* args[1 ] = { w4_int_mask };
3715
+ llvm::Value* args[1 ] = { w4_float_mask };
3707
3716
llvm::Value* int8_mask;
3708
3717
int8_mask = builder ().CreateCall (func, toArrayRef (args));
3709
3718
return int8_mask;
@@ -3727,18 +3736,28 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
3727
3736
auto w4_int_masks = op_quarter_16x (wide_int_mask);
3728
3737
3729
3738
// Now we will use the horizontal sign extraction intrinsic
3730
- // to build a 32 bit mask value.
3739
+ // to build a 32 bit mask value. However the only 128bit
3740
+ // version works on floats, so we will cast from int32 to
3741
+ // float beforehand
3742
+ llvm::Type* w4_float_type = llvm_vector_type (m_llvm_type_float, 4 );
3743
+ std::array<llvm::Value*, 4 > w4_float_masks = {
3744
+ { builder ().CreateBitCast (w4_int_masks[0 ], w4_float_type),
3745
+ builder ().CreateBitCast (w4_int_masks[1 ], w4_float_type),
3746
+ builder ().CreateBitCast (w4_int_masks[2 ], w4_float_type),
3747
+ builder ().CreateBitCast (w4_int_masks[3 ], w4_float_type) }
3748
+ };
3749
+
3731
3750
llvm::Function* func = llvm::Intrinsic::getDeclaration (
3732
- module (), llvm::Intrinsic::x86_sse2_pmovmskb_128 );
3751
+ module (), llvm::Intrinsic::x86_sse_movmsk_ps );
3733
3752
3734
- llvm::Value* args[1 ] = { w4_int_masks [0 ] };
3753
+ llvm::Value* args[1 ] = { w4_float_masks [0 ] };
3735
3754
std::array<llvm::Value*, 4 > int4_masks;
3736
3755
int4_masks[0 ] = builder ().CreateCall (func, toArrayRef (args));
3737
- args[0 ] = w4_int_masks [1 ];
3756
+ args[0 ] = w4_float_masks [1 ];
3738
3757
int4_masks[1 ] = builder ().CreateCall (func, toArrayRef (args));
3739
- args[0 ] = w4_int_masks [2 ];
3758
+ args[0 ] = w4_float_masks [2 ];
3740
3759
int4_masks[2 ] = builder ().CreateCall (func, toArrayRef (args));
3741
- args[0 ] = w4_int_masks [3 ];
3760
+ args[0 ] = w4_float_masks [3 ];
3742
3761
int4_masks[3 ] = builder ().CreateCall (func, toArrayRef (args));
3743
3762
3744
3763
llvm::Value* bits12_15 = op_shl (int4_masks[3 ], constant (12 ));
@@ -3759,14 +3778,22 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
3759
3778
auto w4_int_masks = op_split_8x (wide_int_mask);
3760
3779
3761
3780
// Now we will use the horizontal sign extraction intrinsic
3762
- // to build a 32 bit mask value.
3781
+ // to build a 32 bit mask value. However the only 128bit
3782
+ // version works on floats, so we will cast from int32 to
3783
+ // float beforehand
3784
+ llvm::Type* w4_float_type = llvm_vector_type (m_llvm_type_float, 4 );
3785
+ std::array<llvm::Value*, 2 > w4_float_masks = {
3786
+ { builder ().CreateBitCast (w4_int_masks[0 ], w4_float_type),
3787
+ builder ().CreateBitCast (w4_int_masks[1 ], w4_float_type) }
3788
+ };
3789
+
3763
3790
llvm::Function* func = llvm::Intrinsic::getDeclaration (
3764
- module (), llvm::Intrinsic::x86_sse2_pmovmskb_128 );
3791
+ module (), llvm::Intrinsic::x86_sse_movmsk_ps );
3765
3792
3766
- llvm::Value* args[1 ] = { w4_int_masks [0 ] };
3793
+ llvm::Value* args[1 ] = { w4_float_masks [0 ] };
3767
3794
std::array<llvm::Value*, 2 > int4_masks;
3768
3795
int4_masks[0 ] = builder ().CreateCall (func, toArrayRef (args));
3769
- args[0 ] = w4_int_masks [1 ];
3796
+ args[0 ] = w4_float_masks [1 ];
3770
3797
int4_masks[1 ] = builder ().CreateCall (func, toArrayRef (args));
3771
3798
3772
3799
llvm::Value* bits4_7 = op_shl (int4_masks[1 ], constant (4 ));
@@ -3782,12 +3809,20 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
3782
3809
llvm::Value* w4_int_mask = builder ().CreateSExt (mask,
3783
3810
type_wide_int ());
3784
3811
3812
+ // Now we will use the horizontal sign extraction intrinsic
3813
+ // to build a 32 bit mask value. However the only 128bit
3814
+ // version works on floats, so we will cast from int32 to
3815
+ // float beforehand
3816
+ llvm::Type* w4_float_type = llvm_vector_type (m_llvm_type_float, 4 );
3817
+ llvm::Value* w4_float_mask = builder ().CreateBitCast (w4_int_mask,
3818
+ w4_float_type);
3819
+
3785
3820
// Now we will use the horizontal sign extraction intrinsic
3786
3821
// to build a 32 bit mask value.
3787
3822
llvm::Function* func = llvm::Intrinsic::getDeclaration (
3788
- module (), llvm::Intrinsic::x86_sse2_pmovmskb_128 );
3823
+ module (), llvm::Intrinsic::x86_sse_movmsk_ps );
3789
3824
3790
- llvm::Value* args[1 ] = { w4_int_mask };
3825
+ llvm::Value* args[1 ] = { w4_float_mask };
3791
3826
llvm::Value* int4_mask = builder ().CreateCall (func,
3792
3827
toArrayRef (args));
3793
3828
@@ -3833,12 +3868,20 @@ LLVM_Util::mask4_as_int8(llvm::Value* mask)
3833
3868
// Convert <4 x i1> -> <4 x i32>
3834
3869
llvm::Value* w4_int_mask = builder ().CreateSExt (mask, type_wide_int ());
3835
3870
3871
+ // Now we will use the horizontal sign extraction intrinsic
3872
+ // to build a 32 bit mask value. However the only 128bit
3873
+ // version works on floats, so we will cast from int32 to
3874
+ // float beforehand
3875
+ llvm::Type* w4_float_type = llvm_vector_type (m_llvm_type_float, 4 );
3876
+ llvm::Value* w4_float_mask = builder ().CreateBitCast (w4_int_mask,
3877
+ w4_float_type);
3878
+
3836
3879
// Now we will use the horizontal sign extraction intrinsic
3837
3880
// to build a 32 bit mask value.
3838
3881
llvm::Function* func = llvm::Intrinsic::getDeclaration (
3839
- module (), llvm::Intrinsic::x86_sse2_pmovmskb_128 );
3882
+ module (), llvm::Intrinsic::x86_sse_movmsk_ps );
3840
3883
3841
- llvm::Value* args[1 ] = { w4_int_mask };
3884
+ llvm::Value* args[1 ] = { w4_float_mask };
3842
3885
llvm::Value* int32 = builder ().CreateCall (func, toArrayRef (args));
3843
3886
llvm::Value* i8 = builder ().CreateIntCast (int32, type_int8 (), true );
3844
3887
@@ -4685,7 +4728,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
4685
4728
4686
4729
llvm::Value* unmasked_value = wide_constant (0 );
4687
4730
llvm::Value* args[] = { unmasked_value, void_ptr (src_ptr),
4688
- wide_index, int_mask, constant (4 ) };
4731
+ wide_index, int_mask, constant (4 ) };
4689
4732
return builder ().CreateCall (func_avx512_gather_pi,
4690
4733
toArrayRef (args));
4691
4734
} else if (m_supports_avx2) {
@@ -4705,8 +4748,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
4705
4748
auto w8_int_masks = op_split_16x (wide_int_mask);
4706
4749
auto w8_int_indices = op_split_16x (wide_index);
4707
4750
llvm::Value* args[] = { avx2_unmasked_value, void_ptr (src_ptr),
4708
- w8_int_indices[0 ], w8_int_masks[0 ],
4709
- constant8 ((uint8_t )4 ) };
4751
+ w8_int_indices[0 ], w8_int_masks[0 ],
4752
+ constant8 ((uint8_t )4 ) };
4710
4753
llvm::Value* gather1 = builder ().CreateCall (func_avx2_gather_pi,
4711
4754
toArrayRef (args));
4712
4755
args[2 ] = w8_int_indices[1 ];
@@ -4794,8 +4837,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
4794
4837
toArrayRef (args));
4795
4838
args[2 ] = w8_int_indices[1 ];
4796
4839
args[3 ] = builder ().CreateBitCast (w8_int_masks[1 ],
4797
- llvm_vector_type (type_float (),
4798
- 8 ));
4840
+ llvm_vector_type (type_float (),
4841
+ 8 ));
4799
4842
llvm::Value* gather2 = builder ().CreateCall (func_avx2_gather_ps,
4800
4843
toArrayRef (args));
4801
4844
return op_combine_8x_vectors (gather1, gather2);
@@ -4990,8 +5033,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
4990
5033
toArrayRef (args));
4991
5034
args[2 ] = w8_int_indices[1 ];
4992
5035
args[3 ] = builder ().CreateBitCast (w8_int_masks[1 ],
4993
- llvm_vector_type (type_float (),
4994
- 8 ));
5036
+ llvm_vector_type (type_float (),
5037
+ 8 ));
4995
5038
llvm::Value* gather2 = builder ().CreateCall (func_avx2_gather_ps,
4996
5039
toArrayRef (args));
4997
5040
return op_combine_8x_vectors (gather1, gather2);
@@ -5092,8 +5135,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr,
5092
5135
auto w8_int_indices = op_split_16x (
5093
5136
op_linearize_16x_indices (wide_index));
5094
5137
llvm::Value* args[] = { avx2_unmasked_value, void_ptr (src_ptr),
5095
- w8_int_indices[0 ], w8_int_masks[0 ],
5096
- constant8 ((uint8_t )4 ) };
5138
+ w8_int_indices[0 ], w8_int_masks[0 ],
5139
+ constant8 ((uint8_t )4 ) };
5097
5140
llvm::Value* gather1 = builder ().CreateCall (func_avx2_gather_pi,
5098
5141
toArrayRef (args));
5099
5142
args[2 ] = w8_int_indices[1 ];
@@ -5863,9 +5906,9 @@ LLVM_Util::apply_return_to(llvm::Value* existing_mask)
5863
5906
OSL_ASSERT (masked_function_context ().return_count > 0 );
5864
5907
5865
5908
llvm::Value* loc_of_return_mask = masked_function_context ().location_of_mask ;
5866
- llvm::Value* rs_mask = op_load_mask (loc_of_return_mask);
5867
- llvm::Value* result = builder ().CreateSelect (rs_mask, existing_mask,
5868
- rs_mask);
5909
+ llvm::Value* rs_mask = op_load_mask (loc_of_return_mask);
5910
+ llvm::Value* result = builder ().CreateSelect (rs_mask, existing_mask,
5911
+ rs_mask);
5869
5912
return result;
5870
5913
}
5871
5914
0 commit comments