Skip to content

Commit 52fb623

Browse files
committed
Add support for b4_SSE2 batched mode (2)
1 parent 0d122e7 commit 52fb623

30 files changed

+754
-17
lines changed

.github/workflows/ci.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,17 @@ jobs:
7474
pybind11_ver: v2.5.0
7575
simd: sse4.2
7676
setenvs: export CONAN_LLVM_VERSION=10.0.1
77+
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2
78+
nametag: linux-vfx2021
79+
runner: ubuntu-latest
80+
container: aswftesting/ci-osl:2021-clang11
81+
vfxyear: 2021
82+
cxx_std: 17
83+
openimageio_ver: v2.4.13.0
84+
python_ver: 3.7
85+
pybind11_ver: v2.7.0
86+
simd: sse2
87+
batched: b4_SSE2
7788
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2
7889
nametag: linux-vfx2021
7990
runner: ubuntu-latest

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ else ()
9595
endif ()
9696
set (OSL_LIBNAME_SUFFIX "" CACHE STRING
9797
"Optional name appended to ${PROJECT_NAME} libraries that are built")
98-
option (OSL_BUILD_TESTS "Build the unit tests, testshade, testrender" ON)
98+
option (OSL_BUILD_TESTS "Build the unit tests, testminimal, testshade, testrender" ON)
9999
if (WIN32)
100100
option (USE_LLVM_BITCODE "Generate embedded LLVM bitcode" OFF)
101101
else ()
@@ -220,6 +220,7 @@ add_subdirectory (src/oslc)
220220
add_subdirectory (src/oslinfo)
221221

222222
if (OSL_BUILD_TESTS AND BUILD_TESTING)
223+
add_subdirectory (src/testminimal)
223224
add_subdirectory (src/testshade)
224225
add_subdirectory (src/testrender)
225226
endif ()

src/cmake/compiler.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ endif ()
329329
#
330330
# The USE_BATCHED option may be set to indicate that support for batched
331331
# SIMD shader execution be compiled along with target specific libraries
332-
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
332+
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
333333
option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
334334
set (BATCHED_SUPPORT_DEFINES "")
335335
set (BATCHED_TARGET_LIBS "")

src/cmake/testing.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ macro (osl_add_all_tests)
270270
bug-array-heapoffsets bug-locallifetime bug-outputinit
271271
bug-param-duplicate bug-peep bug-return
272272
calculatenormal-reg
273-
cellnoise closure closure-array closure-layered closure-parameters closure-zero closure-conditional
273+
cellnoise closure closure-array closure-layered closure-parameters closure-string closure-zero closure-conditional
274274
color color-reg colorspace comparison
275275
complement-reg compile-buffer compassign-bool compassign-reg
276276
component-range

src/include/OSL/batched_texture.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
4949
static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
5050
== VecReg<8>::alignment,
5151
"Expect alignment of data member to set alignment of struct");
52+
static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
53+
== VecReg<4>::alignment,
54+
"Expect alignment of data member to set alignment of struct");
5255

5356
template<int WidthT> struct BatchedTextureOptions {
5457
VaryingTextureOptions<WidthT> varying;
@@ -90,11 +93,14 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
9093
static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
9194
== VecReg<8>::alignment,
9295
"Expect alignment of data member to set alignment of struct");
96+
static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
97+
== VecReg<4>::alignment,
98+
"Expect alignment of data member to set alignment of struct");
9399

94100
#ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
95101
// Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
96102
// and safe to reinterpret_cast<TextureOptBatch*>
97-
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
103+
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) || (OIIO::Tex::BatchWidth == 4),
98104
"This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16");
99105

100106
namespace validate_offsets {

src/include/OSL/rendererservices.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
601601
/// Unless overridden, a nullptr is returned.
602602
virtual BatchedRendererServices<16>* batched(WidthOf<16>);
603603
virtual BatchedRendererServices<8>* batched(WidthOf<8>);
604+
virtual BatchedRendererServices<4>* batched(WidthOf<4>);
604605

605606
protected:
606607
TextureSystem* m_texturesys; // A place to hold a TextureSystem

src/liboslexec/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
380380
list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
381381
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
382382
list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
383+
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
384+
list (APPEND TARGET_CXX_OPTS "-march=core2")
383385
else ()
384386
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
385387
endif ()
@@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
455457
list (APPEND TARGET_CXX_OPTS "-march=haswell")
456458
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
457459
list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
460+
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
461+
list (APPEND TARGET_CXX_OPTS "-march=core2")
458462
else ()
459463
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
460464
endif ()

src/liboslexec/batched_analysis.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1813,10 +1813,16 @@ struct Analyzer {
18131813
// specific BatchedRendererServices.
18141814
// Right here we don't know which width will be used,
18151815
// so we will just require all widths provide the same answer
1816+
auto rs4 = m_ba.renderer()->batched(WidthOf<4>());
18161817
auto rs8 = m_ba.renderer()->batched(WidthOf<8>());
18171818
auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
1818-
if (rs8 || rs16) {
1819+
if (rs4 || rs8 || rs16) {
18191820
get_attr_is_uniform = true;
1821+
if (rs4) {
1822+
get_attr_is_uniform
1823+
&= rs4->is_attribute_uniform(obj_name,
1824+
attr_name);
1825+
}
18201826
if (rs8) {
18211827
get_attr_is_uniform
18221828
&= rs8->is_attribute_uniform(obj_name,

src/liboslexec/batched_backendllvm.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
141141
switch (vector_width()) {
142142
case 16: m_true_mask_value = Mask<16>(true).value(); break;
143143
case 8: m_true_mask_value = Mask<8>(true).value(); break;
144+
case 4: m_true_mask_value = Mask<4>(true).value(); break;
144145
default: OSL_ASSERT(0 && "unsupported vector width");
145146
}
146147
ll.dumpasm(shadingsys.m_llvm_dumpasm);

src/liboslexec/batched_llvm_instance.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,33 @@ const char*
537537
= "b8_AVX_";
538538
#endif
539539

540+
#ifdef __OSL_SUPPORTS_b4_SSE2
541+
template<>
542+
const NameAndSignature
543+
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
544+
= {
545+
# define DECL_INDIRECT(name, signature) \
546+
NameAndSignature { #name, signature },
547+
# define DECL(name, signature) DECL_INDIRECT(name, signature)
548+
# define __OSL_WIDTH 4
549+
# define __OSL_TARGET_ISA SSE2
550+
// Don't allow order of xmacro includes be rearranged
551+
// clang-format off
552+
# include "wide/define_opname_macros.h"
553+
# include "builtindecl_wide_xmacro.h"
554+
# include "wide/undef_opname_macros.h"
555+
// clang-format on
556+
# undef __OSL_TARGET_ISA
557+
# undef __OSL_WIDTH
558+
# undef DECL
559+
# undef DECL_INDIRECT
560+
};
561+
template<>
562+
const char*
563+
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
564+
= "b4_SSE2_";
565+
#endif
566+
540567

541568

542569
std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
592619
default: break;
593620
}
594621
break;
622+
case 4:
623+
switch (target_isa) {
624+
#ifdef __OSL_SUPPORTS_b4_SSE2
625+
case TargetISA::x64:
626+
return RetType(
627+
new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
628+
#endif
629+
default: break;
630+
}
631+
break;
632+
595633
default: OSL_ASSERT(0 && "unsupported vector width");
596634
}
597635
std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
735773
{
736774
std::vector<unsigned int> offset_by_index;
737775
switch (m_width) {
776+
case 4:
777+
build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
778+
break;
738779
case 8:
739780
build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
740781
break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
26982739
{
26992740
std::vector<unsigned int> offset_by_index;
27002741
switch (m_width) {
2742+
case 4:
2743+
build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
2744+
break;
27012745
case 8:
27022746
build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
27032747
break;

src/liboslexec/batched_rendservices.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
328328
// Explicitly instantiate BatchedRendererServices template
329329
template class OSLEXECPUBLIC BatchedRendererServices<16>;
330330
template class OSLEXECPUBLIC BatchedRendererServices<8>;
331+
template class OSLEXECPUBLIC BatchedRendererServices<4>;
331332

332333
OSL_NAMESPACE_EXIT

src/liboslexec/context.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
674674
// Explicit template instantiation for supported batch sizes
675675
template class ShadingContext::Batched<16>;
676676
template class ShadingContext::Batched<8>;
677+
template class ShadingContext::Batched<4>;
677678
#endif
678679

679680

src/liboslexec/llvm_passes.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
435435
// including this file will need its own static members defined. LLVM will
436436
// assign IDs when they get registered, so this initialization value is not
437437
// important.
438+
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;
439+
438440
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;
439441

440442
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;

src/liboslexec/llvm_util.cpp

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM()
619619

620620
#ifndef OSL_LLVM_NEW_PASS_MANAGER
621621
// LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks
622+
static llvm::RegisterPass<
623+
LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>>
624+
sRegCustomPass2(
625+
"PreventBitMasksFromBeingLiveinsToBasicBlocks<4>",
626+
"Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass",
627+
false /* Only looks at CFG */, false /* Analysis Pass */);
622628
static llvm::RegisterPass<
623629
LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>>
624630
sRegCustomPass0(
@@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host)
23052311
break;
23062312
}
23072313
case 4:
2308-
// We don't use masking or SIMD shading for 4-wide
2314+
// MUST BE THE FINAL PASS!
2315+
m_new_pass_manager->module_pass_manager.addPass(
2316+
createModuleToFunctionPassAdaptor(
2317+
NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>(
2318+
context())));
23092319
break;
23102320
default:
23112321
std::cout << "m_vector_width = " << m_vector_width << "\n";
@@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host)
26182628
new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>());
26192629
break;
26202630
case 4:
2621-
// We don't use masking or SIMD shading for 4-wide
2631+
// MUST BE THE FINAL PASS!
2632+
mpm.add(
2633+
new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>());
26222634
break;
26232635
default:
26242636
std::cout << "m_vector_width = " << m_vector_width << "\n";
@@ -3592,6 +3604,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask)
35923604
// and all types are happy
35933605
intMaskType = type_int8();
35943606
break;
3607+
case 4:
3608+
// We can just reinterpret cast a 4 bit mask to an 8 bit integer
3609+
// and all types are happy
3610+
intMaskType = type_int8();
3611+
break;
35953612
default: OSL_ASSERT(0 && "unsupported native bit mask width");
35963613
};
35973614

@@ -3950,10 +3967,10 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
39503967
// and all types are happy
39513968
intMaskType = type_int8();
39523969
break;
3953-
#if 0 // WIP
3970+
//#if 0 // WIP
39543971
case 4:
39553972
{
3956-
// We can just reinterpret cast a 8 bit mask to a 8 bit integer
3973+
// We can just reinterpret cast a 4 bit mask to an 8 bit integer
39573974
// and all types are happy
39583975
intMaskType = type_int8();
39593976

@@ -3966,7 +3983,7 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask)
39663983
// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type);
39673984
break;
39683985
}
3969-
#endif
3986+
//#endif
39703987
default: OSL_ASSERT(0 && "unsupported native bit mask width");
39713988
};
39723989

src/liboslexec/rendservices.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>)
524524
return nullptr;
525525
}
526526

527+
BatchedRendererServices<4>*
528+
RendererServices::batched(WidthOf<4>)
529+
{
530+
// No default implementation for batched services
531+
return nullptr;
532+
}
533+
527534
OSL_NAMESPACE_EXIT

src/liboslexec/shadingsys.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width)
618618
m_impl->attribute("llvm_jit_fma", 0);
619619
return true;
620620
}
621+
# endif
622+
if (target_requested) {
623+
break;
624+
}
625+
// fallthrough
626+
default: return false;
627+
};
628+
return false;
629+
case 4:
630+
switch (requestedISA) {
631+
case TargetISA::UNKNOWN:
632+
// fallthrough
633+
case TargetISA::x64:
634+
# ifdef __OSL_SUPPORTS_b4_SSE2
635+
if (LLVM_Util::supports_isa(TargetISA::x64)) {
636+
if (!target_requested)
637+
m_impl->attribute("llvm_jit_target",
638+
LLVM_Util::target_isa_name(
639+
TargetISA::x64));
640+
// SSE2 doesn't support FMA
641+
m_impl->attribute("llvm_jit_fma", 0);
642+
return true;
643+
}
621644
# endif
622645
if (target_requested) {
623646
break;
@@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor<WidthT>::jit_all_groups(int nthreads)
885908
// Explicitly instantiate
886909
template class ShadingSystem::BatchedExecutor<16>;
887910
template class ShadingSystem::BatchedExecutor<8>;
911+
template class ShadingSystem::BatchedExecutor<4>;
888912
#endif
889913

890914

@@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer,
10791103
, m_opt_groupdata(true)
10801104
#if OSL_USE_BATCHED
10811105
, m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr)
1082-
|| (renderer->batched(WidthOf<8>()) != nullptr))
1106+
|| (renderer->batched(WidthOf<8>()) != nullptr)
1107+
|| (renderer->batched(WidthOf<4>()) != nullptr))
10831108
#else
10841109
, m_opt_batched_analysis(false)
10851110
#endif
@@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
37943819
// the batch jit has already happened,
37953820
// as it requires the ops so we can't delete them yet!
37963821
if (((renderer()->batched(WidthOf<16>()) == nullptr)
3797-
&& (renderer()->batched(WidthOf<8>()) == nullptr))
3822+
&& (renderer()->batched(WidthOf<8>()) == nullptr)
3823+
&& (renderer()->batched(WidthOf<4>()) == nullptr))
37983824
|| group.batch_jitted()) {
37993825
group_post_jit_cleanup(group);
38003826
}
@@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched<WidthT>::jit_all_groups(int nthreads, int mythread,
40154041
// machine as well, start with just the batch size
40164042
template class pvt::ShadingSystemImpl::Batched<16>;
40174043
template class pvt::ShadingSystemImpl::Batched<8>;
4044+
template class pvt::ShadingSystemImpl::Batched<4>;
40184045
#endif
40194046

40204047
int

0 commit comments

Comments
 (0)