Skip to content

Commit 9997747

Browse files
committed
Add support for b4_SSE2 batched mode.
Signed-off-by: Tuomas Tonteri <[email protected]>
1 parent 321c803 commit 9997747

21 files changed

+463
-83
lines changed

.github/workflows/ci.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,17 @@ jobs:
7474
pybind11_ver: v2.5.0
7575
simd: sse4.2
7676
setenvs: export CONAN_LLVM_VERSION=10.0.1
77+
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2
78+
nametag: linux-vfx2021
79+
runner: ubuntu-latest
80+
container: aswftesting/ci-osl:2021-clang11
81+
vfxyear: 2021
82+
cxx_std: 17
83+
openimageio_ver: v2.4.13.0
84+
python_ver: 3.7
85+
pybind11_ver: v2.7.0
86+
simd: sse2
87+
batched: b4_SSE2
7788
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2
7889
nametag: linux-vfx2021
7990
runner: ubuntu-latest

INSTALL.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as
99
well, but we don't officially support or test other than these platforms.
1010

1111
Shader execution is supported on the native architectures of those x86_64 and
12-
aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode
13-
requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs
14-
using Cuda+OptiX.
12+
aarch64 platforms, a special batched 4-, 8- or 16-wide SIMD execution mode
13+
requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on
14+
NVIDIA GPUs using Cuda+OptiX.
1515

1616
Dependencies
1717
------------

src/cmake/compiler.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ endif ()
329329
#
330330
# The USE_BATCHED option may be set to indicate that support for batched
331331
# SIMD shader execution be compiled along with target-specific libraries
332-
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
332+
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
333333
option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
334334
set (BATCHED_SUPPORT_DEFINES "")
335335
set (BATCHED_TARGET_LIBS "")

src/include/OSL/batched_texture.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
4949
static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
5050
== VecReg<8>::alignment,
5151
"Expect alignment of data member to set alignment of struct");
52+
static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
53+
== VecReg<4>::alignment,
54+
"Expect alignment of data member to set alignment of struct");
5255

5356
template<int WidthT> struct BatchedTextureOptions {
5457
VaryingTextureOptions<WidthT> varying;
@@ -90,11 +93,15 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
9093
static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
9194
== VecReg<8>::alignment,
9295
"Expect alignment of data member to set alignment of struct");
96+
static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
97+
== VecReg<4>::alignment,
98+
"Expect alignment of data member to set alignment of struct");
9399

94100
#ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
95101
// Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
96102
// and safe to reinterpret_cast<TextureOptBatch*>
97-
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
103+
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8)
104+
|| (OIIO::Tex::BatchWidth == 4),
98105
"This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16");
99106

100107
namespace validate_offsets {

src/include/OSL/llvm_util.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util {
693693
llvm::Constant* constant(uint32_t i);
694694

695695
/// Return an llvm::Constant holding the given integer constant.
696+
llvm::Constant* constant4(int8_t i);
697+
llvm::Constant* constant4(uint8_t i);
696698
llvm::Constant* constant8(int8_t i);
697699
llvm::Constant* constant8(uint8_t i);
698700
llvm::Constant* constant16(int16_t i);
@@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util {
12291231

12301232
llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index);
12311233
llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index);
1234+
llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index);
12321235
std::array<llvm::Value*, 2> op_split_16x(llvm::Value* vector_val);
12331236
std::array<llvm::Value*, 2> op_split_8x(llvm::Value* vector_val);
12341237
std::array<llvm::Value*, 4> op_quarter_16x(llvm::Value* vector_val);

src/include/OSL/rendererservices.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
601601
/// Unless overridden, a nullptr is returned.
602602
virtual BatchedRendererServices<16>* batched(WidthOf<16>);
603603
virtual BatchedRendererServices<8>* batched(WidthOf<8>);
604+
virtual BatchedRendererServices<4>* batched(WidthOf<4>);
604605

605606
protected:
606607
TextureSystem* m_texturesys; // A place to hold a TextureSystem

src/liboslexec/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
380380
list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
381381
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
382382
list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
383+
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
384+
list (APPEND TARGET_CXX_OPTS "-march=core2")
383385
else ()
384386
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
385387
endif ()
@@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
455457
list (APPEND TARGET_CXX_OPTS "-march=haswell")
456458
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
457459
list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
460+
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
461+
list (APPEND TARGET_CXX_OPTS "-march=core2")
458462
else ()
459463
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
460464
endif ()

src/liboslexec/batched_analysis.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1813,10 +1813,16 @@ struct Analyzer {
18131813
// specific BatchedRendererServices.
18141814
// Right here we don't know which width will be used,
18151815
// so we will just require all widths provide the same answer
1816+
auto rs4 = m_ba.renderer()->batched(WidthOf<4>());
18161817
auto rs8 = m_ba.renderer()->batched(WidthOf<8>());
18171818
auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
1818-
if (rs8 || rs16) {
1819+
if (rs4 || rs8 || rs16) {
18191820
get_attr_is_uniform = true;
1821+
if (rs4) {
1822+
get_attr_is_uniform
1823+
&= rs4->is_attribute_uniform(obj_name,
1824+
attr_name);
1825+
}
18201826
if (rs8) {
18211827
get_attr_is_uniform
18221828
&= rs8->is_attribute_uniform(obj_name,

src/liboslexec/batched_backendllvm.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
141141
switch (vector_width()) {
142142
case 16: m_true_mask_value = Mask<16>(true).value(); break;
143143
case 8: m_true_mask_value = Mask<8>(true).value(); break;
144+
case 4: m_true_mask_value = Mask<4>(true).value(); break;
144145
default: OSL_ASSERT(0 && "unsupported vector width");
145146
}
146147
ll.dumpasm(shadingsys.m_llvm_dumpasm);

src/liboslexec/batched_llvm_instance.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,33 @@ const char*
537537
= "b8_AVX_";
538538
#endif
539539

540+
#ifdef __OSL_SUPPORTS_b4_SSE2
541+
template<>
542+
const NameAndSignature
543+
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
544+
= {
545+
# define DECL_INDIRECT(name, signature) \
546+
NameAndSignature { #name, signature },
547+
# define DECL(name, signature) DECL_INDIRECT(name, signature)
548+
# define __OSL_WIDTH 4
549+
# define __OSL_TARGET_ISA SSE2
550+
// Don't allow the order of xmacro includes to be rearranged
551+
// clang-format off
552+
# include "wide/define_opname_macros.h"
553+
# include "builtindecl_wide_xmacro.h"
554+
# include "wide/undef_opname_macros.h"
555+
// clang-format on
556+
# undef __OSL_TARGET_ISA
557+
# undef __OSL_WIDTH
558+
# undef DECL
559+
# undef DECL_INDIRECT
560+
};
561+
template<>
562+
const char*
563+
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
564+
= "b4_SSE2_";
565+
#endif
566+
540567

541568

542569
std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
592619
default: break;
593620
}
594621
break;
622+
case 4:
623+
switch (target_isa) {
624+
#ifdef __OSL_SUPPORTS_b4_SSE2
625+
case TargetISA::x64:
626+
return RetType(
627+
new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
628+
#endif
629+
default: break;
630+
}
631+
break;
632+
595633
default: OSL_ASSERT(0 && "unsupported vector width");
596634
}
597635
std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
735773
{
736774
std::vector<unsigned int> offset_by_index;
737775
switch (m_width) {
776+
case 4:
777+
build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
778+
break;
738779
case 8:
739780
build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
740781
break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
26982739
{
26992740
std::vector<unsigned int> offset_by_index;
27002741
switch (m_width) {
2742+
case 4:
2743+
build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
2744+
break;
27012745
case 8:
27022746
build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
27032747
break;

src/liboslexec/batched_rendservices.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,5 +328,6 @@ BatchedRendererServices<WidthT>::getmessage(BatchedShaderGlobals* bsg,
328328
// Explicitly instantiate BatchedRendererServices template
329329
template class OSLEXECPUBLIC BatchedRendererServices<16>;
330330
template class OSLEXECPUBLIC BatchedRendererServices<8>;
331+
template class OSLEXECPUBLIC BatchedRendererServices<4>;
331332

332333
OSL_NAMESPACE_EXIT

src/liboslexec/context.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg)
674674
// Explicit template instantiation for supported batch sizes
675675
template class ShadingContext::Batched<16>;
676676
template class ShadingContext::Batched<8>;
677+
template class ShadingContext::Batched<4>;
677678
#endif
678679

679680

src/liboslexec/llvm_passes.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final
435435
// including this file will need its own static members defined. LLVM will
436436
// assign IDs when they get registered, so this initialization value is not
437437
// important.
438+
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0;
439+
438440
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0;
439441

440442
template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0;

0 commit comments

Comments
 (0)