diff --git a/indra/llmath/llmath.h b/indra/llmath/llmath.h
index deb0584885..80a1ba7028 100644
--- a/indra/llmath/llmath.h
+++ b/indra/llmath/llmath.h
@@ -155,14 +155,14 @@ constexpr F64 llabs(const F64 a) noexcept
     return std::bit_cast<F64>(std::bit_cast<U64>(a) & 0x7fffffffffffffffull);
 }
 
-constexpr S32 lltrunc(F32 f)
+inline S32 lltrunc(F32 f)
 {
-    return narrow(f);
+    return (S32)std::trunc(f);
 }
 
-constexpr S32 lltrunc(F64 f)
+inline S32 lltrunc(F64 f)
 {
-    return narrow(f);
+    return (S32)std::trunc(f);
 }
 
 inline S32 llfloor(F32 f)
@@ -184,67 +184,19 @@ inline S32 llfloor(F32 f)
 #endif
 }
 
-
 inline S32 llceil( F32 f )
 {
     // This could probably be optimized, but this works.
     return (S32)ceil(f);
 }
 
-
-#ifndef BOGUS_ROUND
-// Use this round.  Does an arithmetic round (0.5 always rounds up)
 inline S32 ll_round(const F32 val)
 {
-    return llfloor(val + 0.5f);
+    return (S32)lround(val); // NOTE: ties now round away from zero (-0.5f -> -1); the old llfloor(val + 0.5f) rounded ties up (-0.5f -> 0)
 }
-
-#else // BOGUS_ROUND
-// Old ll_round implementation - does banker's round (toward nearest even in the case of a 0.5.
-// Not using this because we don't have a consistent implementation on both platforms, use
-// llfloor(val + 0.5f), which is consistent on all platforms.
-inline S32 ll_round(const F32 val)
-{
-    #if LL_WINDOWS
-        // Note: assumes that the floating point control word is set to rounding mode (the default)
-        S32 ret_val;
-        _asm fld    val
-        _asm fistp  ret_val;
-        return ret_val;
-    #elif LL_LINUX
-        // Note: assumes that the floating point control word is set
-        // to rounding mode (the default)
-        S32 ret_val;
-        __asm__ __volatile__( "flds %1    \n\t"
-                              "fistpl %0  \n\t"
-                              : "=m" (ret_val)
-                              : "m" (val) );
-        return ret_val;
-    #else
-        return llfloor(val + 0.5f);
-    #endif
-}
-
-// A fast arithmentic round on intel, from Laurent de Soras http://ldesoras.free.fr
-inline int round_int(double x)
-{
-    const float round_to_nearest = 0.5f;
-    int i;
-    __asm
-    {
-        fld x
-        fadd st, st (0)
-        fadd round_to_nearest
-        fistp i
-        sar i, 1
-    }
-    return (i);
-}
-#endif // BOGUS_ROUND
-
 inline F64 ll_round(const F64 val)
 {
-    return F64(floor(val + 0.5f));
+    return round(val); // NOTE: ties now round away from zero (-0.5 -> -1.0); the old floor(val + 0.5f) rounded ties up (-0.5 -> 0.0)
 }
 
 inline F32 ll_round( F32 val, F32 nearest )
diff --git a/indra/llrender/llrendertarget.cpp b/indra/llrender/llrendertarget.cpp
index c3945220a6..498668b0bf 100644
--- a/indra/llrender/llrendertarget.cpp
+++ b/indra/llrender/llrendertarget.cpp
@@ -641,6 +641,86 @@ void LLRenderTarget::flush()
     }
 }
 
+// Copies a region of source into this target; both must be FBO-backed or the call only logs a warning.
+// A depth-only copy between targets whose mStencil differs uses glCopyTexSubImage2D (unscaled, so
+// dstX1/dstY1 are unused); every other case is a glBlitFramebuffer of the mask buffers using filter.
+void LLRenderTarget::copyContents(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0, S32 dstX1,
+                                  S32 dstY1, U32 mask, U32 filter)
+{
+    LL_PROFILE_GPU_ZONE("LLRenderTarget::copyContents");
+
+    GLboolean write_depth = mask & GL_DEPTH_BUFFER_BIT ? GL_TRUE : GL_FALSE;
+
+    LLGLDepthTest depth(write_depth, write_depth);
+
+    gGL.flush();
+    if (!source.mFBO || !mFBO)
+    {
+        LL_WARNS() << "Cannot copy framebuffer contents for non FBO render targets." << LL_ENDL;
+        return;
+    }
+
+    if (mask == GL_DEPTH_BUFFER_BIT && source.mStencil != mStencil)
+    {
+        stop_glerror();
+
+        glBindFramebuffer(GL_FRAMEBUFFER, source.mFBO);
+        check_framebuffer_status();
+        gGL.getTexUnit(0)->bind(this, true);
+        stop_glerror();
+        // glCopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height):
+        // xoffset/yoffset are the destination texel offset, x/y the source framebuffer
+        // origin, and the last two are dimensions (not endpoints).
+        glCopyTexSubImage2D(LLTexUnit::getInternalType(mUsage), 0, dstX0, dstY0, srcX0, srcY0, srcX1 - srcX0, srcY1 - srcY0);
+        stop_glerror();
+        glBindFramebuffer(GL_FRAMEBUFFER, sCurFBO);
+        stop_glerror();
+    }
+    else
+    {
+        glBindFramebuffer(GL_READ_FRAMEBUFFER, source.mFBO);
+        stop_glerror();
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, mFBO);
+        stop_glerror();
+        check_framebuffer_status();
+        stop_glerror();
+        glBlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter);
+        stop_glerror();
+        // Binding GL_FRAMEBUFFER sets both the read and draw bindings, so one rebind restores sCurFBO.
+        glBindFramebuffer(GL_FRAMEBUFFER, sCurFBO);
+        stop_glerror();
+    }
+}
+
+// static -- blits a region of source's FBO into framebuffer 0, then rebinds sCurFBO.
+void LLRenderTarget::copyContentsToFramebuffer(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0,
+                                               S32 dstX1, S32 dstY1, U32 mask, U32 filter)
+{
+    if (!source.mFBO)
+    {
+        LL_WARNS() << "Cannot copy framebuffer contents for non FBO render targets." << LL_ENDL;
+        return;
+    }
+
+    { // NOTE(review): unlike copyContents, no gGL.flush() is issued before the blit -- confirm that is intended
+        LL_PROFILE_GPU_ZONE("copyContentsToFramebuffer");
+        GLboolean write_depth = mask & GL_DEPTH_BUFFER_BIT ? GL_TRUE : GL_FALSE;
+
+        LLGLDepthTest depth(write_depth, write_depth);
+
+        glBindFramebuffer(GL_READ_FRAMEBUFFER, source.mFBO); // read from the source target
+        stop_glerror();
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); // draw into framebuffer 0
+        stop_glerror();
+        check_framebuffer_status();
+        stop_glerror();
+        glBlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter);
+        stop_glerror();
+        glBindFramebuffer(GL_FRAMEBUFFER, sCurFBO); // GL_FRAMEBUFFER rebinds both the read and draw targets
+        stop_glerror();
+    }
+}
+
 bool LLRenderTarget::isComplete() const
 {
     return !mTex.empty() || mDepth;
diff --git a/indra/llrender/llrendertarget.h b/indra/llrender/llrendertarget.h
index 52ba645e34..11d0a8602f 100644
--- a/indra/llrender/llrendertarget.h
+++ b/indra/llrender/llrendertarget.h
@@ -170,6 +170,12 @@ class LLRenderTarget
     // asserts  that this target is currently bound
     void flush();
 
+    void copyContents(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0, S32 dstX1, S32 dstY1,
+                      U32 mask, U32 filter);
+
+    static void copyContentsToFramebuffer(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0,
+                                          S32 dstX1, S32 dstY1, U32 mask, U32 filter);
+
     //Returns TRUE if target is ready to be rendered into.
     //That is, if the target has been allocated with at least
     //one renderable attachment (i.e. color buffer, depth buffer).
diff --git a/indra/newview/app_settings/settings_alchemy.xml b/indra/newview/app_settings/settings_alchemy.xml
index b323c0f901..306457b7fb 100644
--- a/indra/newview/app_settings/settings_alchemy.xml
+++ b/indra/newview/app_settings/settings_alchemy.xml
@@ -1158,6 +1158,28 @@
             <key>Value</key>
             <integer>0</integer>
         </map>
+        <key>RenderAvatarShadowDetail</key>
+        <map>
+            <key>Comment</key>
+            <string>Which avatar passes cast shadows. 0 = opaque only, 1 = opaque + alpha mask, 2 = full (also alpha blend). Lower values speed up crowd scenes by skipping the expensive alpha-blend avatar shadow pass across all cascades.</string>
+            <key>Persist</key>
+            <integer>1</integer>
+            <key>Type</key>
+            <string>S32</string>
+            <key>Value</key>
+            <integer>2</integer>
+        </map>
+        <key>RenderShadowCullMode</key>
+        <map>
+            <key>Comment</key>
+            <string>How sun shadow cascades are culled. 0 = cull and sort each cascade separately (default). 1 = cull and sort once against a frustum spanning all cascades, sharing the result (less CPU per frame, more GPU vertex work per cascade). Experimental.</string>
+            <key>Persist</key>
+            <integer>1</integer>
+            <key>Type</key>
+            <string>S32</string>
+            <key>Value</key>
+            <integer>0</integer>
+        </map>
         <key>RenderBloomHDR</key>
         <map>
             <key>Comment</key>
diff --git a/indra/newview/lldrawpool.cpp b/indra/newview/lldrawpool.cpp
index 3eca6059ed..56a3baed34 100644
--- a/indra/newview/lldrawpool.cpp
+++ b/indra/newview/lldrawpool.cpp
@@ -786,6 +786,8 @@ void LLRenderPass::pushGLTFBatches(U32 type, bool textured)
 void LLRenderPass::pushGLTFBatches(U32 type)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_DRAWPOOL;
+    LLFetchedGLTFMaterial* lastMat = nullptr;
+    LLViewerTexture* lastTex = nullptr;
     auto* begin = gPipeline.beginRenderMap(type);
     auto* end = gPipeline.endRenderMap(type);
     for (LLCullResult::drawinfo_iterator i = begin; i != end; )
@@ -794,7 +796,7 @@ void LLRenderPass::pushGLTFBatches(U32 type)
         LLDrawInfo& params = **i;
         LLCullResult::increment_iterator(i, end);
 
-        pushGLTFBatch(params);
+        pushGLTFBatch(params, lastMat, lastTex);
     }
 }
 
@@ -814,16 +816,25 @@ void LLRenderPass::pushUntexturedGLTFBatches(U32 type)
 }
 
 // static
-void LLRenderPass::pushGLTFBatch(LLDrawInfo& params)
+void LLRenderPass::pushGLTFBatch(LLDrawInfo& params, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex)
 {
-    auto& mat = params.mGLTFMaterial;
+    LLFetchedGLTFMaterial* mat = params.mGLTFMaterial.get();
 
-    if (mat.notNull())
+    if (mat)
     {
-        mat->bind(params.mTexture);
+        // params.mTexture is the media override (bind() applies it to base color
+        // and emissive), so it is part of the cache key -- otherwise media faces
+        // sharing a material would render with a stale base texture.
+        LLViewerTexture* tex = params.mTexture.get();
+        if (mat != lastMat || tex != lastTex)
+        {
+            mat->bind(params.mTexture);
+            lastMat = mat;
+            lastTex = tex;
+        }
     }
 
-    LLGLDisable cull_face(mat.notNull() && mat->mDoubleSided ? GL_CULL_FACE : 0);
+    LLGLDisable cull_face(mat && mat->mDoubleSided ? GL_CULL_FACE : 0);
 
     setup_texture_matrix(params);
 
@@ -866,6 +877,8 @@ void LLRenderPass::pushRiggedGLTFBatches(U32 type)
     const LLVOAvatar* lastAvatar = nullptr;
     U64 lastMeshId = 0;
     bool skipLastSkin = false;
+    LLFetchedGLTFMaterial* lastMat = nullptr;
+    LLViewerTexture* lastTex = nullptr;
 
     auto* begin = gPipeline.beginRenderMap(type);
     auto* end = gPipeline.endRenderMap(type);
@@ -875,7 +888,7 @@ void LLRenderPass::pushRiggedGLTFBatches(U32 type)
         LLDrawInfo& params = **i;
         LLCullResult::increment_iterator(i, end);
 
-        pushRiggedGLTFBatch(params, lastAvatar, lastMeshId, skipLastSkin);
+        pushRiggedGLTFBatch(params, lastAvatar, lastMeshId, skipLastSkin, lastMat, lastTex);
     }
 }
 
@@ -900,11 +913,11 @@ void LLRenderPass::pushUntexturedRiggedGLTFBatches(U32 type)
 
 
 // static
-void LLRenderPass::pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin)
+void LLRenderPass::pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex)
 {
     if (uploadMatrixPalette(params.mAvatar, params.mSkinInfo, lastAvatar, lastMeshId, skipLastSkin))
     {
-        pushGLTFBatch(params);
+        pushGLTFBatch(params, lastMat, lastTex); // lastMat/lastTex are the caller's material bind cache; left untouched when the draw is skipped
     }
 }
 
diff --git a/indra/newview/lldrawpool.h b/indra/newview/lldrawpool.h
index 46696fc4a4..c645565f06 100644
--- a/indra/newview/lldrawpool.h
+++ b/indra/newview/lldrawpool.h
@@ -40,6 +40,7 @@ class LLDrawInfo;
 class LLVOAvatar;
 class LLGLSLShader;
 class LLMeshSkinInfo;
+class LLFetchedGLTFMaterial;
 
 class LLDrawPool
 {
@@ -376,8 +377,10 @@ class LLRenderPass : public LLDrawPool
     void pushUntexturedRiggedGLTFBatches(U32 type);
 
     // push a single GLTF draw call
-    static void pushGLTFBatch(LLDrawInfo& params);
-    static void pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin);
+    // lastMat/lastTex track the most recently bound material+media texture so
+    // consecutive draws sharing a material skip the redundant LLFetchedGLTFMaterial::bind
+    static void pushGLTFBatch(LLDrawInfo& params, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex);
+    static void pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex);
     static void pushUntexturedGLTFBatch(LLDrawInfo& params);
     static void pushUntexturedRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin);
 
diff --git a/indra/newview/lldrawpoolavatar.cpp b/indra/newview/lldrawpoolavatar.cpp
index 89d85dfa2a..1cca08c20b 100644
--- a/indra/newview/lldrawpoolavatar.cpp
+++ b/indra/newview/lldrawpoolavatar.cpp
@@ -397,6 +397,18 @@ void LLDrawPoolAvatar::renderShadow(S32 pass)
         return;
     }
 
+    // Optionally skip the costlier avatar shadow passes (alpha blend is the most
+    // expensive and least visually important; alpha mask next). Default 2 = full.
+    static LLCachedControl<S32> avatar_shadow_detail(gSavedSettings, "RenderAvatarShadowDetail", 2);
+    if (pass == SHADOW_PASS_AVATAR_ALPHA_BLEND && avatar_shadow_detail() < 2)
+    {
+        return;
+    }
+    if (pass == SHADOW_PASS_AVATAR_ALPHA_MASK && avatar_shadow_detail() < 1)
+    {
+        return;
+    }
+
     LLDrawPoolAvatar::sShadowPass = pass;
 
     if (pass == SHADOW_PASS_AVATAR_OPAQUE)
diff --git a/indra/newview/lldrawpoolbump.cpp b/indra/newview/lldrawpoolbump.cpp
index a07eeb1bed..b1d009afac 100644
--- a/indra/newview/lldrawpoolbump.cpp
+++ b/indra/newview/lldrawpoolbump.cpp
@@ -546,28 +546,51 @@ void LLDrawPoolBump::renderDeferred(S32 pass)
     for (int i = 0; i < 2; ++i)
     {
         bool rigged = i == 1;
+
+        U32 type = rigged ? LLRenderPass::PASS_BUMP_RIGGED : LLRenderPass::PASS_BUMP;
+        LLCullResult::drawinfo_iterator begin = gPipeline.beginRenderMap(type);
+        LLCullResult::drawinfo_iterator end = gPipeline.endRenderMap(type);
+        if (begin == end)
+        {   // no bump geometry in this pass -- skip the shader bind and texture setup
+            continue;
+        }
+
         gDeferredBumpProgram.bind(rigged);
         diffuse_channel = LLGLSLShader::sCurBoundShaderPtr->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
         bump_channel = LLGLSLShader::sCurBoundShaderPtr->enableTexture(LLViewerShaderMgr::BUMP_MAP);
         gGL.getTexUnit(diffuse_channel)->unbind(LLTexUnit::TT_TEXTURE);
         gGL.getTexUnit(bump_channel)->unbind(LLTexUnit::TT_TEXTURE);
 
-        U32 type = rigged ? LLRenderPass::PASS_BUMP_RIGGED : LLRenderPass::PASS_BUMP;
-        LLCullResult::drawinfo_iterator begin = gPipeline.beginRenderMap(type);
-        LLCullResult::drawinfo_iterator end = gPipeline.endRenderMap(type);
-
         const LLVOAvatar* lastAvatar = nullptr;
         U64 lastMeshId = 0;
         bool skipLastSkin = false;
 
+        // Faces are sorted by bumpmap then texture, so the alpha-mask cutoff and the
+        // bump-image bind (an image lookup + texture bind) repeat across runs of faces.
+        // Skip them when unchanged. (bindBumpMap's only side effect, addTextureStats, is
+        // max-based on the source texture, so skipping a repeat is a no-op there too.)
+        U8 lastBump = 255;
+        LLViewerTexture* lastBumpTex = nullptr;
+        F32 lastAlpha = -1.f;
+
         for (LLCullResult::drawinfo_iterator i = begin; i != end; )
         {
             LLDrawInfo& params = **i;
 
             LLCullResult::increment_iterator(i, end);
 
-            LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(params.mAlphaMaskCutoff);
-            LLDrawPoolBump::bindBumpMap(params, bump_channel);
+            if (params.mAlphaMaskCutoff != lastAlpha)
+            {
+                lastAlpha = params.mAlphaMaskCutoff;
+                LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(lastAlpha);
+            }
+
+            if (params.mBump != lastBump || params.mTexture.get() != lastBumpTex)
+            {
+                lastBump = params.mBump;
+                lastBumpTex = params.mTexture.get();
+                LLDrawPoolBump::bindBumpMap(params, bump_channel);
+            }
 
             if (rigged)
             {
diff --git a/indra/newview/lldrawpoolmaterials.cpp b/indra/newview/lldrawpoolmaterials.cpp
index e7ec2022d2..b330785811 100644
--- a/indra/newview/lldrawpoolmaterials.cpp
+++ b/indra/newview/lldrawpoolmaterials.cpp
@@ -50,10 +50,46 @@ S32 LLDrawPoolMaterials::getNumDeferredPasses()
     return 12*2;
 }
 
+// Render-map pass type for each non-rigged material pass; rigged passes use type + 1.
+// Kept in sync with the shader index table in beginDeferredPass and the type list in
+// renderDeferred (which now indexes this same array).
+static const U32 sMaterialPassType[] =
+{
+    LLRenderPass::PASS_MATERIAL,
+    LLRenderPass::PASS_MATERIAL_ALPHA_MASK,
+    LLRenderPass::PASS_MATERIAL_ALPHA_EMISSIVE,
+    LLRenderPass::PASS_SPECMAP,
+    LLRenderPass::PASS_SPECMAP_MASK,
+    LLRenderPass::PASS_SPECMAP_EMISSIVE,
+    LLRenderPass::PASS_NORMMAP,
+    LLRenderPass::PASS_NORMMAP_MASK,
+    LLRenderPass::PASS_NORMMAP_EMISSIVE,
+    LLRenderPass::PASS_NORMSPEC,
+    LLRenderPass::PASS_NORMSPEC_MASK,
+    LLRenderPass::PASS_NORMSPEC_EMISSIVE,
+};
+
+bool LLDrawPoolMaterials::isPassEmpty(S32 pass)
+{
+    // Passes 12..23 are the rigged variants of passes 0..11; they use the next pass type (+1).
+    const bool rigged = pass >= 12;
+    const S32 index = rigged ? pass - 12 : pass;
+    // Mirrors the llassert in renderDeferred, applied before the table is indexed here.
+    llassert(index >= 0 && index < (S32)(sizeof(sMaterialPassType) / sizeof(sMaterialPassType[0])));
+    const U32 type = sMaterialPassType[index] + (rigged ? 1 : 0);
+    // The pass is empty when its render map's begin and end iterators coincide.
+    return gPipeline.beginRenderMap(type) == gPipeline.endRenderMap(type);
+}
+
 void LLDrawPoolMaterials::beginDeferredPass(S32 pass)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MATERIAL;
 
+    if (isPassEmpty(pass))
+    {   // nothing to draw this pass -- skip the (costly) deferred shader bind
+        return;
+    }
+
     bool rigged = false;
     if (pass >= 12)
     {
@@ -97,7 +133,10 @@ void LLDrawPoolMaterials::endDeferredPass(S32 pass)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MATERIAL;
 
-    mShader->unbind();
+    if (!isPassEmpty(pass))
+    {   // only unbind if beginDeferredPass actually bound a shader for this pass
+        mShader->unbind();
+    }
 
     LLRenderPass::endRenderPass(pass);
 }
@@ -105,25 +144,11 @@ void LLDrawPoolMaterials::endDeferredPass(S32 pass)
 void LLDrawPoolMaterials::renderDeferred(S32 pass)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_MATERIAL;
-    static const U32 type_list[] =
-    {
-        LLRenderPass::PASS_MATERIAL,
-        //LLRenderPass::PASS_MATERIAL_ALPHA,
-        LLRenderPass::PASS_MATERIAL_ALPHA_MASK,
-        LLRenderPass::PASS_MATERIAL_ALPHA_EMISSIVE,
-        LLRenderPass::PASS_SPECMAP,
-        //LLRenderPass::PASS_SPECMAP_BLEND,
-        LLRenderPass::PASS_SPECMAP_MASK,
-        LLRenderPass::PASS_SPECMAP_EMISSIVE,
-        LLRenderPass::PASS_NORMMAP,
-        //LLRenderPass::PASS_NORMMAP_BLEND,
-        LLRenderPass::PASS_NORMMAP_MASK,
-        LLRenderPass::PASS_NORMMAP_EMISSIVE,
-        LLRenderPass::PASS_NORMSPEC,
-        //LLRenderPass::PASS_NORMSPEC_BLEND,
-        LLRenderPass::PASS_NORMSPEC_MASK,
-        LLRenderPass::PASS_NORMSPEC_EMISSIVE,
-    };
+
+    if (isPassEmpty(pass))
+    {   // beginDeferredPass skipped the bind for this empty pass; nothing to draw
+        return;
+    }
 
     bool rigged = false;
     if (pass >= 12)
@@ -132,9 +157,9 @@ void LLDrawPoolMaterials::renderDeferred(S32 pass)
         pass -= 12;
     }
 
-    llassert(pass < sizeof(type_list)/sizeof(U32));
+    llassert(pass < sizeof(sMaterialPassType)/sizeof(U32));
 
-    U32 type = type_list[pass];
+    U32 type = sMaterialPassType[pass];
     if (rigged)
     {
         type += 1;
diff --git a/indra/newview/lldrawpoolmaterials.h b/indra/newview/lldrawpoolmaterials.h
index 345697ffd1..5b10a6cd4f 100644
--- a/indra/newview/lldrawpoolmaterials.h
+++ b/indra/newview/lldrawpoolmaterials.h
@@ -41,6 +41,11 @@ class LLGLSLShader;
 class LLDrawPoolMaterials : public LLRenderPass
 {
     LLGLSLShader *mShader;
+
+    // True when this pass's render map is empty (no geometry). begin/render/end all
+    // consult it so an empty pass skips the deferred shader bind/unbind entirely --
+    // modern PBR/simple scenes leave most of the 12 legacy material passes empty.
+    bool isPassEmpty(S32 pass);
 public:
     LLDrawPoolMaterials();
 
diff --git a/indra/newview/lldrawpoolwater.cpp b/indra/newview/lldrawpoolwater.cpp
index d4dd4d1214..01fe2840a4 100644
--- a/indra/newview/lldrawpoolwater.cpp
+++ b/indra/newview/lldrawpoolwater.cpp
@@ -116,22 +116,10 @@ void LLDrawPoolWater::beginPostDeferredPass(S32 pass)
         LLGLDepthTest depth(GL_TRUE, GL_TRUE, GL_ALWAYS);
 
         LLRenderTarget& src = gPipeline.mRT->screen;
-        LLRenderTarget& depth_src = gPipeline.mRT->deferredScreen;
         LLRenderTarget& dst = gPipeline.mWaterDis;
 
-        dst.bindTarget();
-        gCopyDepthProgram.bind();
-
-        S32 diff_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DIFFUSE_MAP);
-        S32 depth_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DEFERRED_DEPTH);
-
-        gGL.getTexUnit(diff_map)->bind(&src);
-        gGL.getTexUnit(depth_map)->bind(&depth_src, true);
-
-        gPipeline.mScreenTriangleVB->setBuffer();
-        gPipeline.mScreenTriangleVB->drawArrays(LLRender::TRIANGLES, 0, 3);
-
-        dst.flush();
+        dst.copyContents(src, 0, 0, src.getWidth(), src.getHeight(), 0, 0, dst.getWidth(), dst.getHeight(),
+                    GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT, GL_NEAREST);
     }
 }
 
diff --git a/indra/newview/llgltfmaterialpreviewmgr.cpp b/indra/newview/llgltfmaterialpreviewmgr.cpp
index c49b751982..11499b15fb 100644
--- a/indra/newview/llgltfmaterialpreviewmgr.cpp
+++ b/indra/newview/llgltfmaterialpreviewmgr.cpp
@@ -510,9 +510,11 @@ bool LLGLTFPreviewTexture::render()
         gPipeline.bindDeferredShader(shader);
         fixup_shader_constants(shader);
 
+        LLFetchedGLTFMaterial* lastMat = nullptr;
+        LLViewerTexture* lastTex = nullptr;
         for (PreviewSpherePart& part : preview_sphere)
         {
-            LLRenderPass::pushGLTFBatch(*part->mDrawInfo);
+            LLRenderPass::pushGLTFBatch(*part->mDrawInfo, lastMat, lastTex);
         }
 
         gPipeline.unbindDeferredShader(shader);
diff --git a/indra/newview/llviewershadermgr.cpp b/indra/newview/llviewershadermgr.cpp
index 69d981c7c1..e1b51e05f2 100644
--- a/indra/newview/llviewershadermgr.cpp
+++ b/indra/newview/llviewershadermgr.cpp
@@ -3566,18 +3566,6 @@ bool LLViewerShaderMgr::loadShadersInterface()
         success = gCopyProgram.createShader();
     }
 
-    if (success)
-    {
-        gCopyDepthProgram.mName = "Copy Depth Shader";
-        gCopyDepthProgram.mShaderFiles.clear();
-        gCopyDepthProgram.mShaderFiles.push_back(make_pair("interface/copyV.glsl", GL_VERTEX_SHADER));
-        gCopyDepthProgram.mShaderFiles.push_back(make_pair("interface/copyF.glsl", GL_FRAGMENT_SHADER));
-        gCopyDepthProgram.clearPermutations();
-        gCopyDepthProgram.addPermutation("COPY_DEPTH", "1");
-        gCopyDepthProgram.mShaderLevel = mShaderLevel[SHADER_INTERFACE];
-        success = gCopyDepthProgram.createShader();
-    }
-
     if (success)
     {
         gDrawColorProgram.mName = "Draw Color Shader";
diff --git a/indra/newview/llviewershadermgr.h b/indra/newview/llviewershadermgr.h
index 903bdb3d35..75bedaf985 100644
--- a/indra/newview/llviewershadermgr.h
+++ b/indra/newview/llviewershadermgr.h
@@ -176,7 +176,6 @@ extern LLGLSLShader         gClipProgram;
 extern LLGLSLShader         gBenchmarkProgram;
 extern LLGLSLShader         gReflectionProbeDisplayProgram;
 extern LLGLSLShader         gCopyProgram;
-extern LLGLSLShader         gCopyDepthProgram;
 extern LLGLSLShader         gPBRTerrainBakeProgram;
 extern LLGLSLShader         gDrawColorProgram;
 
diff --git a/indra/newview/llviewertexture.cpp b/indra/newview/llviewertexture.cpp
index 41c7f9cd01..f7baafbdd1 100644
--- a/indra/newview/llviewertexture.cpp
+++ b/indra/newview/llviewertexture.cpp
@@ -3210,8 +3210,6 @@ void LLViewerLODTexture::processTextureStats()
     }
 }
 
-extern LLGLSLShader gCopyProgram;
-
 bool LLViewerLODTexture::scaleDown()
 {
     if (mGLTexturep.isNull() || !mGLTexturep->getHasGLTexture())
diff --git a/indra/newview/llvovolume.cpp b/indra/newview/llvovolume.cpp
index 8a38615614..a1580dc711 100644
--- a/indra/newview/llvovolume.cpp
+++ b/indra/newview/llvovolume.cpp
@@ -6248,6 +6248,16 @@ struct CompareBatchBreaker
         const LLTextureEntry* lte = lhs->getTextureEntry();
         const LLTextureEntry* rte = rhs->getTextureEntry();
 
+        // Group faces sharing a GLTF material so the PBR push loop can skip
+        // redundant LLFetchedGLTFMaterial::bind calls (see LLRenderPass::pushGLTFBatch).
+        // Non-PBR faces have a null render material, so this is a no-op for them.
+        const LLGLTFMaterial* lgltf = lte->getGLTFRenderMaterial();
+        const LLGLTFMaterial* rgltf = rte->getGLTFRenderMaterial();
+        if (lgltf != rgltf)
+        {
+            return lgltf < rgltf;
+        }
+
         if (lte->getBumpmap() != rte->getBumpmap())
         {
             return lte->getBumpmap() < rte->getBumpmap();
diff --git a/indra/newview/pipeline.cpp b/indra/newview/pipeline.cpp
index 48af355dc1..ac0aba85e0 100644
--- a/indra/newview/pipeline.cpp
+++ b/indra/newview/pipeline.cpp
@@ -2953,26 +2953,6 @@ void LLPipeline::doOcclusion(LLCamera& camera)
         gGL.setColorMask(true, true);
     }
 
-    if (sReflectionProbesEnabled && sUseOcclusion > 1 && !LLPipeline::sShadowRender && !gCubeSnapshot)
-    {
-        gGL.setColorMask(false, false);
-        LLGLDepthTest depth(GL_TRUE, GL_FALSE);
-        LLGLDisable cull(GL_CULL_FACE);
-
-        gOcclusionCubeProgram.bind();
-
-        if (mCubeVB.isNull())
-        { //cube VB will be used for issuing occlusion queries
-            mCubeVB = ll_create_cube_vb(LLVertexBuffer::MAP_VERTEX);
-        }
-        mCubeVB->setBuffer();
-
-        mHeroProbeManager.doOcclusion();
-        gOcclusionCubeProgram.unbind();
-
-        gGL.setColorMask(true, true);
-    }
-
     if (LLPipeline::sUseOcclusion > 1 &&
         (sCull->hasOcclusionGroups() || LLVOCachePartition::sNeedsOcclusionCheck))
     {
@@ -7091,6 +7071,10 @@ void LLPipeline::renderAlphaObjects(bool rigged)
     const LLVOAvatar* lastAvatarGLTF = nullptr;
     U64 lastMeshIdGLTF = 0;
     bool skipLastSkinGLTF;
+    // GLTF material bind cache; invalidated in the non-GLTF branches below since
+    // mSimplePool->pushBatch rebinds texture units and would clobber the material
+    LLFetchedGLTFMaterial* lastMatGLTF = nullptr;
+    LLViewerTexture* lastTexGLTF = nullptr;
     auto* begin = gPipeline.beginRenderMap(type);
     auto* end = gPipeline.endRenderMap(type);
 
@@ -7114,7 +7098,7 @@ void LLPipeline::renderAlphaObjects(bool rigged)
                 LLGLSLShader::sCurBoundShaderPtr->uniform1i(LLShaderMgr::SUN_UP_FACTOR, sun_up);
                 LLGLSLShader::sCurBoundShaderPtr->uniform1f(LLShaderMgr::DEFERRED_SHADOW_TARGET_WIDTH, (float)target_width);
                 LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(ALPHA_BLEND_CUTOFF);
-                LLRenderPass::pushRiggedGLTFBatch(*pparams, lastAvatarGLTF, lastMeshIdGLTF, skipLastSkinGLTF);
+                LLRenderPass::pushRiggedGLTFBatch(*pparams, lastAvatarGLTF, lastMeshIdGLTF, skipLastSkinGLTF, lastMatGLTF, lastTexGLTF);
             }
             else
             {
@@ -7122,6 +7106,8 @@ void LLPipeline::renderAlphaObjects(bool rigged)
                 LLGLSLShader::sCurBoundShaderPtr->uniform1i(LLShaderMgr::SUN_UP_FACTOR, sun_up);
                 LLGLSLShader::sCurBoundShaderPtr->uniform1f(LLShaderMgr::DEFERRED_SHADOW_TARGET_WIDTH, (float)target_width);
                 LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(ALPHA_BLEND_CUTOFF);
+                lastMatGLTF = nullptr; // pushBatch clobbers texture units
+                lastTexGLTF = nullptr;
                 if (mSimplePool->uploadMatrixPalette(pparams->mAvatar, pparams->mSkinInfo, lastAvatar, lastMeshId, skipLastSkin))
                 {
                     mSimplePool->pushBatch(*pparams, true, true);
@@ -7136,7 +7122,7 @@ void LLPipeline::renderAlphaObjects(bool rigged)
                 LLGLSLShader::sCurBoundShaderPtr->uniform1i(LLShaderMgr::SUN_UP_FACTOR, sun_up);
                 LLGLSLShader::sCurBoundShaderPtr->uniform1f(LLShaderMgr::DEFERRED_SHADOW_TARGET_WIDTH, (float)target_width);
                 LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(ALPHA_BLEND_CUTOFF);
-                LLRenderPass::pushGLTFBatch(*pparams);
+                LLRenderPass::pushGLTFBatch(*pparams, lastMatGLTF, lastTexGLTF);
             }
             else
             {
@@ -7144,6 +7130,8 @@ void LLPipeline::renderAlphaObjects(bool rigged)
                 LLGLSLShader::sCurBoundShaderPtr->uniform1i(LLShaderMgr::SUN_UP_FACTOR, sun_up);
                 LLGLSLShader::sCurBoundShaderPtr->uniform1f(LLShaderMgr::DEFERRED_SHADOW_TARGET_WIDTH, (float)target_width);
                 LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(ALPHA_BLEND_CUTOFF);
+                lastMatGLTF = nullptr; // pushBatch clobbers texture units
+                lastTexGLTF = nullptr;
                 mSimplePool->pushBatch(*pparams, true, true);
             }
         }
@@ -7324,14 +7312,8 @@ void LLPipeline::generateExposure(LLRenderTarget* src, LLRenderTarget* dst, bool
         if (use_history)
         {
             // copy last frame's exposure into mLastExposure
-            mLastExposure.bindTarget();
-            gCopyProgram.bind();
-            gGL.getTexUnit(0)->bind(dst);
-
-            mScreenTriangleVB->setBuffer();
-            mScreenTriangleVB->drawArrays(LLRender::TRIANGLES, 0, 3);
-
-            mLastExposure.flush();
+            mLastExposure.copyContents(*dst, 0, 0, dst->getWidth(), dst->getHeight(), 0, 0, mLastExposure.getWidth(), mLastExposure.getHeight(),
+                             GL_COLOR_BUFFER_BIT, GL_NEAREST);
         }
 
         dst->bindTarget();
@@ -7989,23 +7971,8 @@ void LLPipeline::copyScreenSpaceReflections(LLRenderTarget* src, LLRenderTarget*
     {
         LL_PROFILE_GPU_ZONE("ssr copy");
         LLGLDepthTest depth(GL_TRUE, GL_TRUE, GL_ALWAYS);
-
-        LLRenderTarget& depth_src = mRT->deferredScreen;
-
-        dst->bindTarget();
-        dst->clear();
-        gCopyDepthProgram.bind();
-
-        S32 diff_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DIFFUSE_MAP);
-        S32 depth_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DEFERRED_DEPTH);
-
-        gGL.getTexUnit(diff_map)->bind(src);
-        gGL.getTexUnit(depth_map)->bind(&depth_src, true);
-
-        mScreenTriangleVB->setBuffer();
-        mScreenTriangleVB->drawArrays(LLRender::TRIANGLES, 0, 3);
-
-        dst->flush();
+        dst->copyContents(*src, 0, 0, src->getWidth(), src->getHeight(), 0, 0, dst->getWidth(), dst->getHeight(),
+                         GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT, GL_NEAREST);
     }
 }
 
@@ -9906,24 +9873,11 @@ void LLPipeline::doAtmospherics()
             LLGLDepthTest depth(GL_TRUE, GL_TRUE, GL_ALWAYS);
 
             LLRenderTarget& src = gPipeline.mRT->screen;
-            LLRenderTarget& depth_src = gPipeline.mRT->deferredScreen;
             LLRenderTarget& dst = gPipeline.mWaterDis;
 
-            mRT->screen.flush();
-            dst.bindTarget();
-            gCopyDepthProgram.bind();
-
-            S32 diff_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DIFFUSE_MAP);
-            S32 depth_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DEFERRED_DEPTH);
-
-            gGL.getTexUnit(diff_map)->bind(&src);
-            gGL.getTexUnit(depth_map)->bind(&depth_src, true);
-
-            gGL.setColorMask(false, false);
-            gPipeline.mScreenTriangleVB->setBuffer();
-            gPipeline.mScreenTriangleVB->drawArrays(LLRender::TRIANGLES, 0, 3);
-
-            dst.flush();
+            src.flush();
+            dst.copyContents(src, 0, 0, src.getWidth(), src.getHeight(), 0, 0, dst.getWidth(), dst.getHeight(),
+                             GL_DEPTH_BUFFER_BIT, GL_NEAREST);
             mRT->screen.bindTarget();
         }
 
@@ -9970,24 +9924,11 @@ void LLPipeline::doWaterHaze()
             LLGLDepthTest depth(GL_TRUE, GL_TRUE, GL_ALWAYS);
 
             LLRenderTarget& src = gPipeline.mRT->screen;
-            LLRenderTarget& depth_src = gPipeline.mRT->deferredScreen;
             LLRenderTarget& dst = gPipeline.mWaterDis;
 
-            mRT->screen.flush();
-            dst.bindTarget();
-            gCopyDepthProgram.bind();
-
-            S32 diff_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DIFFUSE_MAP);
-            S32 depth_map = gCopyDepthProgram.getTextureChannel(LLShaderMgr::DEFERRED_DEPTH);
-
-            gGL.getTexUnit(diff_map)->bind(&src);
-            gGL.getTexUnit(depth_map)->bind(&depth_src, true);
-
-            gGL.setColorMask(false, false);
-            gPipeline.mScreenTriangleVB->setBuffer();
-            gPipeline.mScreenTriangleVB->drawArrays(LLRender::TRIANGLES, 0, 3);
-
-            dst.flush();
+            src.flush();
+            dst.copyContents(src, 0, 0, src.getWidth(), src.getHeight(), 0, 0, dst.getWidth(), dst.getHeight(),
+                    GL_DEPTH_BUFFER_BIT, GL_NEAREST);
             mRT->screen.bindTarget();
         }
 
@@ -10403,7 +10344,7 @@ static LLTrace::BlockTimerStatHandle FTM_SHADOW_ALPHA_TREE("Alpha Tree");
 static LLTrace::BlockTimerStatHandle FTM_SHADOW_ALPHA_GRASS("Alpha Grass");
 static LLTrace::BlockTimerStatHandle FTM_SHADOW_FULLBRIGHT_ALPHA_MASKED("Fullbright Alpha Masked");
 
-void LLPipeline::renderShadow(const glm::mat4& view, const glm::mat4& proj, LLCamera& shadow_cam, LLCullResult& result, bool depth_clamp)
+void LLPipeline::renderShadow(const glm::mat4& view, const glm::mat4& proj, LLCamera& shadow_cam, LLCullResult& result, bool depth_clamp, bool do_cull)
 {
     LL_PROFILE_ZONE_SCOPED_CATEGORY_PIPELINE; //LL_RECORD_BLOCK_TIME(FTM_SHADOW_RENDER);
     LL_PROFILE_GPU_ZONE("renderShadow");
@@ -10439,7 +10380,13 @@ void LLPipeline::renderShadow(const glm::mat4& view, const glm::mat4& proj, LLCa
 
     LLGLDepthTest depth_test(GL_TRUE, GL_TRUE, GL_LESS);
 
-    updateCull(shadow_cam, result);
+    // In RenderShadowCullMode 1, do_cull is false: generateSunShadow did the single union
+    // octree cull and pre-filtered `result` to this cascade's frustum (bucketShadowCull),
+    // so skip the per-cascade octree walk and only sort/build this cascade's render map.
+    if (do_cull)
+    {
+        updateCull(shadow_cam, result);
+    }
 
     stateSort(shadow_cam, result);
 
@@ -10837,6 +10784,49 @@ class LLDisableOcclusionCulling
     }
 };
 
+// Re-bucket a shared sun-shadow cull (the result of one union octree walk) down to a single
+// cascade. `src` is the union cull, `cam` the cascade's shadow camera, and `dst` receives the
+// cascade's lists (it is cleared first). Visible and drawable groups are copied only when the
+// group is not dead and cam.AABBInFrustum() on its object bounds returns > 0 -- intended to
+// mirror the per-cascade cull's object-bounds test (NOTE(review): confirm against mode 0).
+// Called from generateSunShadow when the union cull ran (RenderShadowCullMode 1).
+static void bucketShadowCull(LLCullResult& src, LLCamera& cam, LLCullResult& dst)
+{
+    dst.clear(); // the caller passes a static per-cascade LLCullResult; drop its old lists
+
+    for (LLCullResult::sg_iterator i = src.beginVisibleGroups(), end = src.endVisibleGroups(); i != end; ++i)
+    {
+        LLSpatialGroup* group = *i;
+        if (!group->isDead() &&
+            cam.AABBInFrustum(group->getObjectBounds()[0], group->getObjectBounds()[1]) > 0)
+        {
+            dst.pushVisibleGroup(group);
+        }
+    }
+
+    for (LLCullResult::sg_iterator i = src.beginDrawableGroups(), end = src.endDrawableGroups(); i != end; ++i)
+    {
+        LLSpatialGroup* group = *i;
+        if (!group->isDead() &&
+            cam.AABBInFrustum(group->getObjectBounds()[0], group->getObjectBounds()[1]) > 0)
+        {
+            dst.pushDrawableGroup(group);
+        }
+    }
+
+    // Individual drawables and spatial bridges (attachments/animesh) are few; pass them
+    // through unfiltered -- conservative (they render into every cascade) but correct.
+    for (LLCullResult::drawable_iterator i = src.beginVisibleList(), end = src.endVisibleList(); i != end; ++i)
+    {
+        dst.pushDrawable(*i);
+    }
+
+    for (LLCullResult::bridge_iterator i = src.beginVisibleBridge(), end = src.endVisibleBridge(); i != end; ++i)
+    {
+        dst.pushBridge(*i);
+    }
+}
+
 void LLPipeline::generateSunShadow(LLCamera& camera)
 {
     if (!sRenderDeferred || RenderShadowDetail <= 0)
@@ -11070,6 +11060,105 @@ void LLPipeline::generateSunShadow(LLCamera& camera)
     }
     else
     {
+        // RenderShadowCullMode 1: do the expensive octree cull ONCE against a frustum
+        // spanning every sun cascade, then have each cascade cheaply re-bucket the union's
+        // visible groups by its own frustum (bucketShadowCull) and build its own render
+        // map. Saves 3 of 4 octree walks per frame while each cascade still renders only
+        // its own slice -- GPU-neutral vs. per-cascade culling, so it helps CPU-bound
+        // targets without regressing GPU-bound ones. Disabled in cube snapshots. Default 0.
+        static LLCachedControl<S32> sShadowCullMode(gSavedSettings, "RenderShadowCullMode", 0);
+        bool have_union_cull = false;
+        static LLCullResult sUnionShadowResult;
+        if (sShadowCullMode() == 1 && !gCubeSnapshot)
+        {
+            // updateFrustumPlanes below seeds the frustum corners from the *current* GL
+            // matrices, and earlier setup in this function leaves them in a non-main-view
+            // state. Restore the saved (main-view) matrices first, as the cascade loop
+            // does each iteration, so the corner directions used below are correct.
+            set_current_modelview(saved_view);
+            set_current_projection(saved_proj);
+
+            LLCamera ucam = camera;
+            ucam.setFar(16.f);
+            LLViewerCamera::updateFrustumPlanes(ucam, false, false, true);
+
+            LLVector3 ueye = camera.getOrigin();
+            LLVector3* ufrust = ucam.mAgentFrustum;
+            LLVector3 upn = ucam.getAtAxis();
+            for (U32 i = 0; i < 4; i++)
+            {
+                LLVector3 delta = ufrust[i+4]-ueye;
+                delta += (ufrust[i+4]-ufrust[(i+2)%4+4])*0.05f;
+                delta.normVec();
+                F32 dp = delta*upn;
+                ufrust[i]   = ueye + (delta*dist[0]*0.75f)/dp;
+                ufrust[i+4] = ueye + (delta*dist[4]*1.25f)/dp;
+            }
+
+            {
+                glm::mat4 uview = look(camera.getOrigin(), lightDir, -up);
+
+                // AABB the 8 full-range frustum corners directly in light space. ufrust
+                // spans [dist[0], dist[4]] (built above), so this box is a guaranteed
+                // superset of every cascade. getVisiblePointCloud is NOT usable here: the
+                // far corners sit past the view far plane, so it clips the cloud down to
+                // the 4 near corners and the union collapses to a dot at the camera.
+                LLVector3 mn(mul_mat4_vec3(uview, glm::vec3(ufrust[0])));
+                LLVector3 mx = mn;
+                for (U32 i = 1; i < 8; i++)
+                {
+                    LLVector3 p(mul_mat4_vec3(uview, glm::vec3(ufrust[i])));
+                    update_min_max(mn, mx, p);
+                }
+
+                LLVector3 ucenter = (mn+mx)*0.5f;
+
+                // Conservative ortho light-space projection bounding the whole point
+                // cloud. updateFrustumPlanes derives the cull frustum from the *current*
+                // GL modelview/projection, so set them here. Ortho is looser than the
+                // per-cascade perspective fit, so the result is a superset of every
+                // cascade frustum -- no dropped casters.
+                //
+                // Pad the depth range: with the sun near-overhead the light-space
+                // footprint is nearly planar (znear ~= zfar), which makes glm::ortho
+                // singular and updateFrustumPlanes unproject to NaN frustum corners --
+                // shadows then drop and flip with camera angle. The near plane is
+                // replaced by shadow_near_clip below and the far only needs to clear the
+                // receivers, so widening the depth range is always safe.
+                F32 zpad = llmax(mx.mV[0] - mn.mV[0], mx.mV[1] - mn.mV[1]) * 0.5f + 1.f;
+                glm::mat4 uproj = glm::ortho(mn.mV[0], mx.mV[0], mn.mV[1], mx.mV[1], -mx.mV[2] - zpad, -mn.mV[2] + zpad);
+
+                ucam.setOriginAndLookAt(ueye, up, ucenter);
+                ucam.setOrigin(0, 0, 0);
+
+                LLViewerCamera::sCurCameraID = LLViewerCamera::CAMERA_SUN_SHADOW0;
+                set_current_modelview(uview);
+                set_current_projection(uproj);
+                LLViewerCamera::updateFrustumPlanes(ucam, false, false, true);
+                ucam.getAgentPlane(LLCamera::AGENT_PLANE_NEAR).set(shadow_near_clip);
+
+                bool saved_shadow_render = LLPipeline::sShadowRender;
+                U32 saved_occlusion = sUseOcclusion;
+                LLPipeline::sShadowRender = true;
+                // Disable occlusion culling for the shadow cull exactly as renderShadow
+                // does: occlusion queries are main-camera and previous-frame based, so
+                // leaving them on wrongly culls casters hidden from the main view (their
+                // shadows still show) and flickers as the queries resolve frame to frame.
+                sUseOcclusion = 0;
+                // One octree walk for the whole sun shadow. No stateSort here -- each
+                // cascade re-buckets these visible groups and sorts its own render map.
+                updateCull(ucam, sUnionShadowResult);
+                sUseOcclusion = saved_occlusion;
+                LLPipeline::sShadowRender = saved_shadow_render;
+
+                // restore main matrices (the cascade loop sets its own each iteration)
+                set_current_modelview(saved_view);
+                set_current_projection(saved_proj);
+
+                have_union_cull = true;
+            }
+        }
+
         for (S32 j = 0; j < (gCubeSnapshot ? 2 : 4); j++)
         {
             if (!hasRenderDebugMask(RENDER_DEBUG_SHADOW_FRUSTA) && !gCubeSnapshot)
@@ -11430,7 +11519,11 @@ void LLPipeline::generateSunShadow(LLCamera& camera)
 
             {
                 static LLCullResult result[4];
-                renderShadow(view[j], proj[j], shadow_cam, result[j], true);
+                if (have_union_cull)
+                {   // re-bucket the shared union cull down to this cascade's frustum
+                    bucketShadowCull(sUnionShadowResult, shadow_cam, result[j]);
+                }
+                renderShadow(view[j], proj[j], shadow_cam, result[j], true, !have_union_cull);
             }
 
             mRT->shadow[j].flush();
diff --git a/indra/newview/pipeline.h b/indra/newview/pipeline.h
index 665b016ef0..f7cc8dd10c 100644
--- a/indra/newview/pipeline.h
+++ b/indra/newview/pipeline.h
@@ -353,7 +353,7 @@ class LLPipeline
 
     void renderHighlight(const LLViewerObject* obj, F32 fade);
 
-    void renderShadow(const glm::mat4& view, const glm::mat4& proj, LLCamera& camera, LLCullResult& result, bool depth_clamp);
+    void renderShadow(const glm::mat4& view, const glm::mat4& proj, LLCamera& camera, LLCullResult& result, bool depth_clamp, bool do_cull = true); // do_cull=false skips updateCull: `result` must already hold this pass's culled lists
     void renderSelectedFaces(const LLColor4& color);
     void renderHighlights();
     void renderDebug();