diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 12785f3e78607..34933b1e47ce7 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -124,7 +124,7 @@ pub mod graph { use crate::{deferred::DeferredPbrLightingPlugin, graph::NodePbr}; use bevy_app::prelude::*; -use bevy_asset::{load_internal_asset, weak_handle, AssetApp, AssetPath, Assets, Handle}; +use bevy_asset::{AssetApp, AssetPath, Assets, Handle}; use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; use bevy_ecs::prelude::*; use bevy_image::Image; @@ -135,7 +135,7 @@ use bevy_render::{ extract_resource::ExtractResourcePlugin, load_shader_library, render_graph::RenderGraph, - render_resource::{Shader, ShaderRef}, + render_resource::ShaderRef, sync_component::SyncComponentPlugin, view::VisibilitySystems, ExtractSchedule, Render, RenderApp, RenderDebugFlags, RenderSystems, @@ -149,9 +149,6 @@ fn shader_ref(path: PathBuf) -> ShaderRef { ShaderRef::Path(AssetPath::from_path_buf(path).with_source("embedded")) } -const MESHLET_VISIBILITY_BUFFER_RESOLVE_SHADER_HANDLE: Handle = - weak_handle!("69187376-3dea-4d0f-b3f5-185bde63d6a2"); - pub const TONEMAPPING_LUT_TEXTURE_BINDING_INDEX: u32 = 26; pub const TONEMAPPING_LUT_SAMPLER_BINDING_INDEX: u32 = 27; @@ -205,12 +202,7 @@ impl Plugin for PbrPlugin { load_shader_library!(app, "render/view_transformations.wgsl"); // Setup dummy shaders for when MeshletPlugin is not used to prevent shader import errors. - load_internal_asset!( - app, - MESHLET_VISIBILITY_BUFFER_RESOLVE_SHADER_HANDLE, - "meshlet/dummy_visibility_buffer_resolve.wgsl", - Shader::from_wgsl - ); + load_shader_library!(app, "meshlet/dummy_visibility_buffer_resolve.wgsl"); app.register_asset_reflect::() .register_type::() diff --git a/crates/bevy_pbr/src/meshlet/asset.rs b/crates/bevy_pbr/src/meshlet/asset.rs index c158650d1bd4c..6910e6800f039 100644 --- a/crates/bevy_pbr/src/meshlet/asset.rs +++ b/crates/bevy_pbr/src/meshlet/asset.rs @@ -6,9 +6,9 @@ use bevy_asset::{ }; use bevy_math::{Vec2, Vec3}; use bevy_reflect::TypePath; +use bevy_render::render_resource::ShaderType; use bevy_tasks::block_on; use bytemuck::{Pod, Zeroable}; -use half::f16; use lz4_flex::frame::{FrameDecoder, FrameEncoder}; use std::io::{Read, Write}; use thiserror::Error; @@ -17,7 +17,7 @@ use thiserror::Error; const MESHLET_MESH_ASSET_MAGIC: u64 = 1717551717668; /// The current version of the [`MeshletMesh`] asset format. -pub const MESHLET_MESH_ASSET_VERSION: u64 = 1; +pub const MESHLET_MESH_ASSET_VERSION: u64 = 2; /// A mesh that has been pre-processed into multiple small clusters of triangles called meshlets. /// @@ -47,12 +47,32 @@ pub struct MeshletMesh { pub(crate) vertex_uvs: Arc<[Vec2]>, /// Triangle indices for meshlets. pub(crate) indices: Arc<[u8]>, + /// The BVH8 used for culling and LOD selection of the meshlets. The root is at index 0. + pub(crate) bvh: Arc<[BvhNode]>, /// The list of meshlets making up this mesh. pub(crate) meshlets: Arc<[Meshlet]>, /// Spherical bounding volumes. - pub(crate) meshlet_bounding_spheres: Arc<[MeshletBoundingSpheres]>, - /// Meshlet group and parent group simplification errors. - pub(crate) meshlet_simplification_errors: Arc<[MeshletSimplificationError]>, + pub(crate) meshlet_cull_data: Arc<[MeshletCullData]>, + /// The tight AABB of the meshlet mesh, used for frustum and occlusion culling at the instance + /// level. + pub(crate) aabb: MeshletAabb, + /// The depth of the culling BVH, used to determine the number of dispatches at runtime. + pub(crate) bvh_depth: u32, +} + +/// A single BVH8 node in the BVH used for culling and LOD selection of a [`MeshletMesh`]. +#[derive(Copy, Clone, Default, Pod, Zeroable)] +#[repr(C)] +pub struct BvhNode { + /// The tight AABBs of this node's children, used for frustum and occlusion during BVH + /// traversal. + pub aabbs: [MeshletAabbErrorOffset; 8], + /// The LOD bounding spheres of this node's children, used for LOD selection during BVH + /// traversal. + pub lod_bounds: [MeshletBoundingSphere; 8], + /// If `u8::MAX`, it indicates that the child of each children is a BVH node, otherwise it is the number of meshlets in the group. + pub child_counts: [u8; 8], + pub _padding: [u32; 2], } /// A single meshlet within a [`MeshletMesh`]. @@ -91,31 +111,37 @@ pub struct Meshlet { /// Bounding spheres used for culling and choosing level of detail for a [`Meshlet`]. #[derive(Copy, Clone, Pod, Zeroable)] #[repr(C)] -pub struct MeshletBoundingSpheres { - /// Bounding sphere used for frustum and occlusion culling for this meshlet. - pub culling_sphere: MeshletBoundingSphere, +pub struct MeshletCullData { + /// Tight bounding box, used for frustum and occlusion culling for this meshlet. + pub aabb: MeshletAabbErrorOffset, /// Bounding sphere used for determining if this meshlet's group is at the correct level of detail for a given view. pub lod_group_sphere: MeshletBoundingSphere, - /// Bounding sphere used for determining if this meshlet's parent group is at the correct level of detail for a given view. - pub lod_parent_group_sphere: MeshletBoundingSphere, } -/// A spherical bounding volume used for a [`Meshlet`]. -#[derive(Copy, Clone, Pod, Zeroable)] +/// An axis-aligned bounding box used for a [`Meshlet`]. +#[derive(Copy, Clone, Default, Pod, Zeroable, ShaderType)] #[repr(C)] -pub struct MeshletBoundingSphere { +pub struct MeshletAabb { pub center: Vec3, - pub radius: f32, + pub half_extent: Vec3, } -/// Simplification error used for choosing level of detail for a [`Meshlet`]. -#[derive(Copy, Clone, Pod, Zeroable)] +// An axis-aligned bounding box used for a [`Meshlet`]. +#[derive(Copy, Clone, Default, Pod, Zeroable, ShaderType)] #[repr(C)] -pub struct MeshletSimplificationError { - /// Simplification error used for determining if this meshlet's group is at the correct level of detail for a given view. - pub group_error: f16, - /// Simplification error used for determining if this meshlet's parent group is at the correct level of detail for a given view. - pub parent_group_error: f16, +pub struct MeshletAabbErrorOffset { + pub center: Vec3, + pub error: f32, + pub half_extent: Vec3, + pub child_offset: u32, +} + +/// A spherical bounding volume used for a [`Meshlet`]. +#[derive(Copy, Clone, Default, Pod, Zeroable)] +#[repr(C)] +pub struct MeshletBoundingSphere { + pub center: Vec3, + pub radius: f32, } /// An [`AssetSaver`] for `.meshlet_mesh` [`MeshletMesh`] assets. @@ -149,9 +175,11 @@ impl AssetSaver for MeshletMeshSaver { write_slice(&asset.vertex_normals, &mut writer)?; write_slice(&asset.vertex_uvs, &mut writer)?; write_slice(&asset.indices, &mut writer)?; + write_slice(&asset.bvh, &mut writer)?; write_slice(&asset.meshlets, &mut writer)?; - write_slice(&asset.meshlet_bounding_spheres, &mut writer)?; - write_slice(&asset.meshlet_simplification_errors, &mut writer)?; + write_slice(&asset.meshlet_cull_data, &mut writer)?; + writer.write_all(bytemuck::bytes_of(&asset.aabb))?; + writer.write_all(bytemuck::bytes_of(&asset.bvh_depth))?; writer.finish()?; Ok(()) @@ -190,18 +218,26 @@ impl AssetLoader for MeshletMeshLoader { let vertex_normals = read_slice(reader)?; let vertex_uvs = read_slice(reader)?; let indices = read_slice(reader)?; + let bvh = read_slice(reader)?; let meshlets = read_slice(reader)?; - let meshlet_bounding_spheres = read_slice(reader)?; - let meshlet_simplification_errors = read_slice(reader)?; + let meshlet_cull_data = read_slice(reader)?; + let mut bytes = [0u8; 24]; + reader.read_exact(&mut bytes)?; + let aabb = bytemuck::cast(bytes); + let mut bytes = [0u8; 4]; + reader.read_exact(&mut bytes)?; + let bvh_depth = u32::from_le_bytes(bytes); Ok(MeshletMesh { vertex_positions, vertex_normals, vertex_uvs, indices, + bvh, meshlets, - meshlet_bounding_spheres, - meshlet_simplification_errors, + meshlet_cull_data, + aabb, + bvh_depth, }) } diff --git a/crates/bevy_pbr/src/meshlet/cull_bvh.wgsl b/crates/bevy_pbr/src/meshlet/cull_bvh.wgsl new file mode 100644 index 0000000000000..f4d6be6bf8270 --- /dev/null +++ b/crates/bevy_pbr/src/meshlet/cull_bvh.wgsl @@ -0,0 +1,110 @@ +#import bevy_pbr::meshlet_bindings::{ + InstancedOffset, + get_aabb, + get_aabb_error, + get_aabb_child_offset, + constants, + meshlet_bvh_nodes, + meshlet_bvh_cull_count_read, + meshlet_bvh_cull_count_write, + meshlet_bvh_cull_dispatch, + meshlet_bvh_cull_queue, + meshlet_meshlet_cull_count_early, + meshlet_meshlet_cull_count_late, + meshlet_meshlet_cull_dispatch_early, + meshlet_meshlet_cull_dispatch_late, + meshlet_meshlet_cull_queue, + meshlet_second_pass_bvh_count, + meshlet_second_pass_bvh_dispatch, + meshlet_second_pass_bvh_queue, +} +#import bevy_pbr::meshlet_cull_shared::{ + lod_error_is_imperceptible, + aabb_in_frustum, + should_occlusion_cull_aabb, +} + +@compute +@workgroup_size(128, 1, 1) // 8 threads per node +fn cull_bvh(@builtin(global_invocation_id) global_invocation_id: vec3) { + // Calculate the queue ID for this thread + let dispatch_id = global_invocation_id.x; + var node = dispatch_id >> 3u; + let subnode = dispatch_id & 7u; + if node >= meshlet_bvh_cull_count_read { return; } + + node = select(node, constants.rightmost_slot - node, constants.read_from_front == 0u); + let instanced_offset = meshlet_bvh_cull_queue[node]; + let instance_id = instanced_offset.instance_id; + let bvh_node = &meshlet_bvh_nodes[instanced_offset.offset]; + + var aabb_error_offset = (*bvh_node).aabbs[subnode]; + let aabb = get_aabb(&aabb_error_offset); + let parent_error = get_aabb_error(&aabb_error_offset); + let lod_sphere = (*bvh_node).lod_bounds[subnode]; + + let parent_is_imperceptible = lod_error_is_imperceptible(lod_sphere, parent_error, instance_id); + // Error and frustum cull, in both passes + if parent_is_imperceptible || !aabb_in_frustum(aabb, instance_id) { return; } + + let child_offset = get_aabb_child_offset(&aabb_error_offset); + let index = subnode >> 2u; + let bit_offset = subnode & 3u; + let packed_child_count = (*bvh_node).child_counts[index]; + let child_count = extractBits(packed_child_count, bit_offset * 8u, 8u); + var value = InstancedOffset(instance_id, child_offset); + + // If we pass, try occlusion culling + // If this node was occluded, push it's children to the second pass to check against this frame's HZB + if should_occlusion_cull_aabb(aabb, instance_id) { +#ifdef MESHLET_FIRST_CULLING_PASS + if child_count == 255u { + let id = atomicAdd(&meshlet_second_pass_bvh_count, 1u); + meshlet_second_pass_bvh_queue[id] = value; + if ((id & 15u) == 0u) { + atomicAdd(&meshlet_second_pass_bvh_dispatch.x, 1u); + } + } else { + let base = atomicAdd(&meshlet_meshlet_cull_count_late, child_count); + let start = constants.rightmost_slot - base; + for (var i = start; i < start - child_count; i--) { + meshlet_meshlet_cull_queue[i] = value; + value.offset += 1u; + } + let req = (base + child_count + 127u) >> 7u; + atomicMax(&meshlet_meshlet_cull_dispatch_late.x, req); + } +#endif + return; + } + + // If we pass, push the children to the next BVH cull + if child_count == 255u { + let id = atomicAdd(&meshlet_bvh_cull_count_write, 1u); + let index = select(constants.rightmost_slot - id, id, constants.read_from_front == 0u); + meshlet_bvh_cull_queue[index] = value; + if ((id & 15u) == 0u) { + atomicAdd(&meshlet_bvh_cull_dispatch.x, 1u); + } + } else { +#ifdef MESHLET_FIRST_CULLING_PASS + let base = atomicAdd(&meshlet_meshlet_cull_count_early, child_count); + let end = base + child_count; + for (var i = base; i < end; i++) { + meshlet_meshlet_cull_queue[i] = value; + value.offset += 1u; + } + let req = (end + 127u) >> 7u; + atomicMax(&meshlet_meshlet_cull_dispatch_early.x, req); +#else + let base = atomicAdd(&meshlet_meshlet_cull_count_late, child_count); + let start = constants.rightmost_slot - base; + for (var i = start; i < start - child_count; i--) { + meshlet_meshlet_cull_queue[i] = value; + value.offset += 1u; + } + let req = (base + child_count + 127u) >> 7u; + atomicMax(&meshlet_meshlet_cull_dispatch_late.x, req); +#endif + } +} diff --git a/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl b/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl index 47f6dbb04b6be..85cbc0654d184 100644 --- a/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl +++ b/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl @@ -1,194 +1,93 @@ #import bevy_pbr::meshlet_bindings::{ - meshlet_cluster_meshlet_ids, - meshlet_bounding_spheres, - meshlet_simplification_errors, - meshlet_cluster_instance_ids, - meshlet_instance_uniforms, - meshlet_second_pass_candidates, - depth_pyramid, + InstancedOffset, + get_aabb, + get_aabb_error, + constants, view, - previous_view, - should_cull_instance, - cluster_is_second_pass_candidate, + meshlet_instance_uniforms, + meshlet_cull_data, meshlet_software_raster_indirect_args, meshlet_hardware_raster_indirect_args, + meshlet_previous_raster_counts, meshlet_raster_clusters, - constants, - MeshletBoundingSphere, + meshlet_meshlet_cull_count_read, + meshlet_meshlet_cull_count_write, + meshlet_meshlet_cull_dispatch, + meshlet_meshlet_cull_queue, +} +#import bevy_pbr::meshlet_cull_shared::{ + ScreenAabb, + project_aabb, + lod_error_is_imperceptible, + aabb_in_frustum, + should_occlusion_cull_aabb, } #import bevy_render::maths::affine3_to_square -/// Culls individual clusters (1 per thread) in two passes (two pass occlusion culling), and outputs a bitmask of which clusters survived. -/// 1. The first pass tests instance visibility, frustum culling, LOD selection, and finally occlusion culling using last frame's depth pyramid. -/// 2. The second pass performs occlusion culling (using the depth buffer generated from the first pass) on all clusters that passed -/// the instance, frustum, and LOD tests in the first pass, but were not visible last frame according to the occlusion culling. - @compute -@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread -fn cull_clusters( - @builtin(workgroup_id) workgroup_id: vec3, - @builtin(num_workgroups) num_workgroups: vec3, - @builtin(local_invocation_index) local_invocation_index: u32, -) { - // Calculate the cluster ID for this thread - let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u)); - if cluster_id >= constants.scene_cluster_count { return; } +@workgroup_size(128, 1, 1) // 1 cluster per thread +fn cull_clusters(@builtin(global_invocation_id) global_invocation_id: vec3) { + if global_invocation_id.x >= meshlet_meshlet_cull_count_read { return; } -#ifdef MESHLET_SECOND_CULLING_PASS - if !cluster_is_second_pass_candidate(cluster_id) { return; } -#endif - - // Check for instance culling - let instance_id = meshlet_cluster_instance_ids[cluster_id]; #ifdef MESHLET_FIRST_CULLING_PASS - if should_cull_instance(instance_id) { return; } + let meshlet_id = global_invocation_id.x; +#else + let meshlet_id = constants.rightmost_slot - global_invocation_id.x; #endif - - // Calculate world-space culling bounding sphere for the cluster - let instance_uniform = meshlet_instance_uniforms[instance_id]; - let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; - let world_from_local = affine3_to_square(instance_uniform.world_from_local); - let world_scale = max(length(world_from_local[0]), max(length(world_from_local[1]), length(world_from_local[2]))); - let bounding_spheres = meshlet_bounding_spheres[meshlet_id]; - let culling_bounding_sphere_center = world_from_local * vec4(bounding_spheres.culling_sphere.center, 1.0); - let culling_bounding_sphere_radius = world_scale * bounding_spheres.culling_sphere.radius; - + let instanced_offset = meshlet_meshlet_cull_queue[meshlet_id]; + let instance_id = instanced_offset.instance_id; + let cull_data = &meshlet_cull_data[instanced_offset.offset]; + var aabb_error_offset = (*cull_data).aabb; + let aabb = get_aabb(&aabb_error_offset); + let error = get_aabb_error(&aabb_error_offset); + let lod_sphere = (*cull_data).lod_group_sphere; + + let is_imperceptible = lod_error_is_imperceptible(lod_sphere, error, instance_id); + // Error and frustum cull, in both passes + if !is_imperceptible || !aabb_in_frustum(aabb, instance_id) { return; } + + // If we pass, try occlusion culling + // If this node was occluded, push it's children to the second pass to check against this frame's HZB + if should_occlusion_cull_aabb(aabb, instance_id) { #ifdef MESHLET_FIRST_CULLING_PASS - // Frustum culling - // TODO: Faster method from https://vkguide.dev/docs/gpudriven/compute_culling/#frustum-culling-function - for (var i = 0u; i < 6u; i++) { - if dot(view.frustum[i], culling_bounding_sphere_center) + culling_bounding_sphere_radius <= 0.0 { - return; + let id = atomicAdd(&meshlet_meshlet_cull_count_write, 1u); + let value = InstancedOffset(instance_id, instanced_offset.offset); + meshlet_meshlet_cull_queue[constants.rightmost_slot - id] = value; + if ((id & 127u) == 0) { + atomicAdd(&meshlet_meshlet_cull_dispatch.x, 1u); } - } - - // Check LOD cut (cluster group error imperceptible, and parent group error not imperceptible) - let simplification_errors = unpack2x16float(meshlet_simplification_errors[meshlet_id]); - let lod_is_ok = lod_error_is_imperceptible(bounding_spheres.lod_group_sphere, simplification_errors.x, world_from_local, world_scale); - let parent_lod_is_ok = lod_error_is_imperceptible(bounding_spheres.lod_parent_group_sphere, simplification_errors.y, world_from_local, world_scale); - if !lod_is_ok || parent_lod_is_ok { return; } -#endif - - // Project the culling bounding sphere to view-space for occlusion culling -#ifdef MESHLET_FIRST_CULLING_PASS - let previous_world_from_local = affine3_to_square(instance_uniform.previous_world_from_local); - let previous_world_from_local_scale = max(length(previous_world_from_local[0]), max(length(previous_world_from_local[1]), length(previous_world_from_local[2]))); - let occlusion_culling_bounding_sphere_center = previous_world_from_local * vec4(bounding_spheres.culling_sphere.center, 1.0); - let occlusion_culling_bounding_sphere_radius = previous_world_from_local_scale * bounding_spheres.culling_sphere.radius; - let occlusion_culling_bounding_sphere_center_view_space = (previous_view.view_from_world * vec4(occlusion_culling_bounding_sphere_center.xyz, 1.0)).xyz; -#else - let occlusion_culling_bounding_sphere_center = culling_bounding_sphere_center; - let occlusion_culling_bounding_sphere_radius = culling_bounding_sphere_radius; - let occlusion_culling_bounding_sphere_center_view_space = (view.view_from_world * vec4(occlusion_culling_bounding_sphere_center.xyz, 1.0)).xyz; #endif + return; + } - var aabb = project_view_space_sphere_to_screen_space_aabb(occlusion_culling_bounding_sphere_center_view_space, occlusion_culling_bounding_sphere_radius); - let depth_pyramid_size_mip_0 = vec2(textureDimensions(depth_pyramid, 0)); - var aabb_width_pixels = (aabb.z - aabb.x) * depth_pyramid_size_mip_0.x; - var aabb_height_pixels = (aabb.w - aabb.y) * depth_pyramid_size_mip_0.y; - let depth_level = max(0, i32(ceil(log2(max(aabb_width_pixels, aabb_height_pixels))))); // TODO: Naga doesn't like this being a u32 - let depth_pyramid_size = vec2(textureDimensions(depth_pyramid, depth_level)); - let aabb_top_left = vec2(aabb.xy * depth_pyramid_size); - - let depth_quad_a = textureLoad(depth_pyramid, aabb_top_left, depth_level).x; - let depth_quad_b = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 0u), depth_level).x; - let depth_quad_c = textureLoad(depth_pyramid, aabb_top_left + vec2(0u, 1u), depth_level).x; - let depth_quad_d = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 1u), depth_level).x; - let occluder_depth = min(min(depth_quad_a, depth_quad_b), min(depth_quad_c, depth_quad_d)); - - // Check whether or not the cluster would be occluded if drawn - var cluster_visible: bool; - if view.clip_from_view[3][3] == 1.0 { - // Orthographic - let sphere_depth = view.clip_from_view[3][2] + (occlusion_culling_bounding_sphere_center_view_space.z + occlusion_culling_bounding_sphere_radius) * view.clip_from_view[2][2]; - cluster_visible = sphere_depth >= occluder_depth; + // If we pass, rasterize the meshlet + // Check how big the cluster is in screen space + let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local); + let clip_from_local = view.clip_from_world * world_from_local; + let projection = view.clip_from_world; + var near: f32; + if projection[3][3] == 1.0 { + near = projection[3][2] / projection[2][2]; } else { - // Perspective - let sphere_depth = -view.clip_from_view[3][2] / (occlusion_culling_bounding_sphere_center_view_space.z + occlusion_culling_bounding_sphere_radius); - cluster_visible = sphere_depth >= occluder_depth; + near = projection[3][2]; } - - // Write if the cluster should be occlusion tested in the second pass -#ifdef MESHLET_FIRST_CULLING_PASS - if !cluster_visible { - let bit = 1u << cluster_id % 32u; - atomicOr(&meshlet_second_pass_candidates[cluster_id / 32u], bit); + var screen_aabb = ScreenAabb(vec3(0.0), vec3(0.0)); + var sw_raster = project_aabb(clip_from_local, near, aabb, &screen_aabb); + if sw_raster { + let aabb_size = (screen_aabb.max.xy - screen_aabb.min.xy) * view.viewport.zw; + sw_raster = all(aabb_size <= vec2(64.0)); } -#endif - - // Cluster would be occluded if drawn, so don't setup a draw for it - if !cluster_visible { return; } - - // Check how big the cluster is in screen space -#ifdef MESHLET_FIRST_CULLING_PASS - let culling_bounding_sphere_center_view_space = (view.view_from_world * vec4(culling_bounding_sphere_center.xyz, 1.0)).xyz; - aabb = project_view_space_sphere_to_screen_space_aabb(culling_bounding_sphere_center_view_space, culling_bounding_sphere_radius); - aabb_width_pixels = (aabb.z - aabb.x) * view.viewport.z; - aabb_height_pixels = (aabb.w - aabb.y) * view.viewport.w; -#endif - let cluster_is_small = all(vec2(aabb_width_pixels, aabb_height_pixels) < vec2(64.0)); - - // Let the hardware rasterizer handle near-plane clipping - let not_intersects_near_plane = dot(view.frustum[4u], culling_bounding_sphere_center) > culling_bounding_sphere_radius; var buffer_slot: u32; - if cluster_is_small && not_intersects_near_plane { + if sw_raster { // Append this cluster to the list for software rasterization buffer_slot = atomicAdd(&meshlet_software_raster_indirect_args.x, 1u); + buffer_slot += meshlet_previous_raster_counts[0]; } else { // Append this cluster to the list for hardware rasterization buffer_slot = atomicAdd(&meshlet_hardware_raster_indirect_args.instance_count, 1u); - buffer_slot = constants.meshlet_raster_cluster_rightmost_slot - buffer_slot; - } - meshlet_raster_clusters[buffer_slot] = cluster_id; -} - -// https://github.com/zeux/meshoptimizer/blob/1e48e96c7e8059321de492865165e9ef071bffba/demo/nanite.cpp#L115 -fn lod_error_is_imperceptible(lod_sphere: MeshletBoundingSphere, simplification_error: f32, world_from_local: mat4x4, world_scale: f32) -> bool { - let sphere_world_space = (world_from_local * vec4(lod_sphere.center, 1.0)).xyz; - let radius_world_space = world_scale * lod_sphere.radius; - let error_world_space = world_scale * simplification_error; - - var projected_error = error_world_space; - if view.clip_from_view[3][3] != 1.0 { - // Perspective - let distance_to_closest_point_on_sphere = distance(sphere_world_space, view.world_position) - radius_world_space; - let distance_to_closest_point_on_sphere_clamped_to_znear = max(distance_to_closest_point_on_sphere, view.clip_from_view[3][2]); - projected_error /= distance_to_closest_point_on_sphere_clamped_to_znear; - } - projected_error *= view.clip_from_view[1][1] * 0.5; - projected_error *= view.viewport.w; - - return projected_error < 1.0; -} - -// https://zeux.io/2023/01/12/approximate-projected-bounds -fn project_view_space_sphere_to_screen_space_aabb(cp: vec3, r: f32) -> vec4 { - let inv_width = view.clip_from_view[0][0] * 0.5; - let inv_height = view.clip_from_view[1][1] * 0.5; - if view.clip_from_view[3][3] == 1.0 { - // Orthographic - let min_x = cp.x - r; - let max_x = cp.x + r; - - let min_y = cp.y - r; - let max_y = cp.y + r; - - return vec4(min_x * inv_width, 1.0 - max_y * inv_height, max_x * inv_width, 1.0 - min_y * inv_height); - } else { - // Perspective - let c = vec3(cp.xy, -cp.z); - let cr = c * r; - let czr2 = c.z * c.z - r * r; - - let vx = sqrt(c.x * c.x + czr2); - let min_x = (vx * c.x - cr.z) / (vx * c.z + cr.x); - let max_x = (vx * c.x + cr.z) / (vx * c.z - cr.x); - - let vy = sqrt(c.y * c.y + czr2); - let min_y = (vy * c.y - cr.z) / (vy * c.z + cr.y); - let max_y = (vy * c.y + cr.z) / (vy * c.z - cr.y); - - return vec4(min_x * inv_width, -max_y * inv_height, max_x * inv_width, -min_y * inv_height) + vec4(0.5); + buffer_slot += meshlet_previous_raster_counts[1]; + buffer_slot = constants.rightmost_slot - buffer_slot; } + meshlet_raster_clusters[buffer_slot] = InstancedOffset(instance_id, instanced_offset.offset); } diff --git a/crates/bevy_pbr/src/meshlet/cull_instances.wgsl b/crates/bevy_pbr/src/meshlet/cull_instances.wgsl new file mode 100644 index 0000000000000..5d14d10b6f795 --- /dev/null +++ b/crates/bevy_pbr/src/meshlet/cull_instances.wgsl @@ -0,0 +1,76 @@ +#import bevy_pbr::meshlet_bindings::{ + InstancedOffset, + constants, + meshlet_view_instance_visibility, + meshlet_instance_aabbs, + meshlet_instance_bvh_root_nodes, + meshlet_bvh_cull_count_write, + meshlet_bvh_cull_dispatch, + meshlet_bvh_cull_queue, + meshlet_second_pass_instance_count, + meshlet_second_pass_instance_dispatch, + meshlet_second_pass_instance_candidates, +} +#import bevy_pbr::meshlet_cull_shared::{ + aabb_in_frustum, + should_occlusion_cull_aabb, +} + +fn instance_count() -> u32 { +#ifdef MESHLET_FIRST_CULLING_PASS + return constants.scene_instance_count; +#else + return meshlet_second_pass_instance_count; +#endif +} + +fn map_instance_id(id: u32) -> u32 { +#ifdef MESHLET_FIRST_CULLING_PASS + return id; +#else + return meshlet_second_pass_instance_candidates[id]; +#endif +} + +fn should_cull_instance(instance_id: u32) -> bool { + let bit_offset = instance_id >> 5u; + let packed_visibility = meshlet_view_instance_visibility[instance_id & 31u]; + return bool(extractBits(packed_visibility, bit_offset, 1u)); +} + +@compute +@workgroup_size(128, 1, 1) // 1 instance per thread +fn cull_instances(@builtin(global_invocation_id) global_invocation_id: vec3) { + // Calculate the instance ID for this thread + let dispatch_id = global_invocation_id.x; + if dispatch_id >= instance_count() { return; } + + let instance_id = map_instance_id(dispatch_id); + let aabb = meshlet_instance_aabbs[instance_id]; + + // Visibility and frustum cull, but only in the first pass +#ifdef MESHLET_FIRST_CULLING_PASS + if should_cull_instance(instance_id) || !aabb_in_frustum(aabb, instance_id) { return; } +#endif + + // If we pass, try occlusion culling + // If this instance was occluded, push it to the second pass to check against this frame's HZB + if should_occlusion_cull_aabb(aabb, instance_id) { +#ifdef MESHLET_FIRST_CULLING_PASS + let id = atomicAdd(&meshlet_second_pass_instance_count, 1u); + meshlet_second_pass_instance_candidates[id] = instance_id; + if ((id & 127u) == 0u) { + atomicAdd(&meshlet_second_pass_instance_dispatch.x, 1u); + } +#endif + return; + } + + // If we pass, push the instance's root node to BVH cull + let root_node = meshlet_instance_bvh_root_nodes[instance_id]; + let id = atomicAdd(&meshlet_bvh_cull_count_write, 1u); + meshlet_bvh_cull_queue[id] = InstancedOffset(instance_id, root_node); + if ((id & 15u) == 0u) { + atomicAdd(&meshlet_bvh_cull_dispatch.x, 1u); + } +} diff --git a/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl b/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl deleted file mode 100644 index db39ae2bcedb9..0000000000000 --- a/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl +++ /dev/null @@ -1,50 +0,0 @@ -#import bevy_pbr::meshlet_bindings::{ - scene_instance_count, - meshlet_global_cluster_count, - meshlet_instance_meshlet_counts, - meshlet_instance_meshlet_slice_starts, - meshlet_cluster_instance_ids, - meshlet_cluster_meshlet_ids, -} - -/// Writes out instance_id and meshlet_id to the global buffers for each cluster in the scene. - -var cluster_slice_start_workgroup: u32; - -@compute -@workgroup_size(1024, 1, 1) // 1024 threads per workgroup, 1 instance per workgroup -fn fill_cluster_buffers( - @builtin(workgroup_id) workgroup_id: vec3, - @builtin(num_workgroups) num_workgroups: vec3, - @builtin(local_invocation_index) local_invocation_index: u32, -) { - // Calculate the instance ID for this workgroup - var instance_id = workgroup_id.x + (workgroup_id.y * num_workgroups.x); - if instance_id >= scene_instance_count { return; } - - let instance_meshlet_count = meshlet_instance_meshlet_counts[instance_id]; - let instance_meshlet_slice_start = meshlet_instance_meshlet_slice_starts[instance_id]; - - // Reserve cluster slots for the instance and broadcast to the workgroup - if local_invocation_index == 0u { - cluster_slice_start_workgroup = atomicAdd(&meshlet_global_cluster_count, instance_meshlet_count); - } - let cluster_slice_start = workgroupUniformLoad(&cluster_slice_start_workgroup); - - // Loop enough times to write out all the meshlets for the instance given that each thread writes 1 meshlet in each iteration - for (var clusters_written = 0u; clusters_written < instance_meshlet_count; clusters_written += 1024u) { - // Calculate meshlet ID within this instance's MeshletMesh to process for this thread - let meshlet_id_local = clusters_written + local_invocation_index; - if meshlet_id_local >= instance_meshlet_count { return; } - - // Find the overall cluster ID in the global cluster buffer - let cluster_id = cluster_slice_start + meshlet_id_local; - - // Find the overall meshlet ID in the global meshlet buffer - let meshlet_id = instance_meshlet_slice_start + meshlet_id_local; - - // Write results to buffers - meshlet_cluster_instance_ids[cluster_id] = instance_id; - meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id; - } -} diff --git a/crates/bevy_pbr/src/meshlet/fill_counts.wgsl b/crates/bevy_pbr/src/meshlet/fill_counts.wgsl new file mode 100644 index 0000000000000..f319e395d93c4 --- /dev/null +++ b/crates/bevy_pbr/src/meshlet/fill_counts.wgsl @@ -0,0 +1,35 @@ +/// Copies the counts of meshlets in the hardware and software buckets, resetting the counters in the process. + +struct DispatchIndirectArgs { + x: u32, + y: u32, + z: u32, +} + +struct DrawIndirectArgs { + vertex_count: u32, + instance_count: u32, + first_vertex: u32, + first_instance: u32, +} + +@group(0) @binding(0) var meshlet_software_raster_indirect_args: DispatchIndirectArgs; +@group(0) @binding(1) var meshlet_hardware_raster_indirect_args: DrawIndirectArgs; +@group(0) @binding(2) var meshlet_previous_raster_counts: array; +#ifdef MESHLET_2D_DISPATCH +@group(0) @binding(3) var meshlet_software_raster_cluster_count: u32; +#endif + +@compute +@workgroup_size(1, 1, 1) +fn fill_counts() { +#ifdef MESHLET_2D_DISPATCH + meshlet_previous_raster_counts[0] += meshlet_software_raster_cluster_count; +#else + meshlet_previous_raster_counts[0] += meshlet_software_raster_indirect_args.x; +#endif + meshlet_software_raster_indirect_args.x = 0; + + meshlet_previous_raster_counts[1] += meshlet_hardware_raster_indirect_args.instance_count; + meshlet_hardware_raster_indirect_args.instance_count = 0; +} diff --git a/crates/bevy_pbr/src/meshlet/from_mesh.rs b/crates/bevy_pbr/src/meshlet/from_mesh.rs index ed2be52f538dd..0b478a4461723 100644 --- a/crates/bevy_pbr/src/meshlet/from_mesh.rs +++ b/crates/bevy_pbr/src/meshlet/from_mesh.rs @@ -1,16 +1,20 @@ -use super::asset::{ - Meshlet, MeshletBoundingSphere, MeshletBoundingSpheres, MeshletMesh, MeshletSimplificationError, -}; +use crate::meshlet::asset::{MeshletAabb, MeshletAabbErrorOffset, MeshletCullData}; + +use super::asset::{BvhNode, Meshlet, MeshletBoundingSphere, MeshletMesh}; use alloc::borrow::Cow; -use bevy_math::{ops::log2, IVec3, Vec2, Vec3, Vec3Swizzles}; +use bevy_math::{ + bounding::{Aabb3d, BoundingSphere, BoundingVolume}, + ops::log2, + IVec3, Isometry3d, Vec2, Vec3, Vec3A, Vec3Swizzles, +}; use bevy_platform::collections::HashMap; use bevy_render::{ mesh::{Indices, Mesh}, render_resource::PrimitiveTopology, }; +use bevy_tasks::{AsyncComputeTaskPool, ParallelSlice}; use bitvec::{order::Lsb0, vec::BitVec, view::BitView}; -use core::{iter, ops::Range}; -use half::f16; +use core::{f32, ops::Range}; use itertools::Itertools; use meshopt::{ build_meshlets, ffi::meshopt_Meshlet, generate_vertex_remap_multi, @@ -19,11 +23,13 @@ use meshopt::{ use metis::{option::Opt, Graph}; use smallvec::SmallVec; use thiserror::Error; +use tracing::debug_span; // Aim to have 8 meshlets per group const TARGET_MESHLETS_PER_GROUP: usize = 8; -// Reject groups that keep over 95% of their original triangles -const SIMPLIFICATION_FAILURE_PERCENTAGE: f32 = 0.95; +// Reject groups that keep over 60% of their original triangles. We'd much rather render a few +// extra triangles than create too many meshlets, increasing cull overhead. +const SIMPLIFICATION_FAILURE_PERCENTAGE: f32 = 0.60; /// Default vertex position quantization factor for use with [`MeshletMesh::from_mesh`]. /// @@ -64,6 +70,9 @@ impl MeshletMesh { mesh: &Mesh, vertex_position_quantization_factor: u8, ) -> Result { + let s = debug_span!("build meshlet mesh"); + let _e = s.enter(); + // Validate mesh format let indices = validate_input_mesh(mesh)?; @@ -84,41 +93,28 @@ impl MeshletMesh { ); // Split the mesh into an initial list of meshlets (LOD 0) - let mut meshlets = compute_meshlets( + let (mut meshlets, mut cull_data) = compute_meshlets( &indices, &vertices, &position_only_vertex_remap, position_only_vertex_count, + None, ); - let mut bounding_spheres = meshlets - .iter() - .map(|meshlet| compute_meshlet_bounds(meshlet, &vertices)) - .map(|bounding_sphere| MeshletBoundingSpheres { - culling_sphere: bounding_sphere, - lod_group_sphere: bounding_sphere, - lod_parent_group_sphere: MeshletBoundingSphere { - center: Vec3::ZERO, - radius: 0.0, - }, - }) - .collect::>(); - let mut simplification_errors = iter::repeat_n( - MeshletSimplificationError { - group_error: f16::ZERO, - parent_group_error: f16::MAX, - }, - meshlets.len(), - ) - .collect::>(); let mut vertex_locks = vec![false; vertices.vertex_count]; // Build further LODs - let mut simplification_queue = 0..meshlets.len(); - while simplification_queue.len() > 1 { + let mut bvh = BvhBuilder::default(); + let mut all_groups = Vec::new(); + let mut simplification_queue: Vec<_> = (0..meshlets.len() as u32).collect(); + let mut stuck = Vec::new(); + while !simplification_queue.is_empty() { + let s = debug_span!("simplify lod", meshlets = simplification_queue.len()); + let _e = s.enter(); + // For each meshlet build a list of connected meshlets (meshlets that share a vertex) let connected_meshlets_per_meshlet = find_connected_meshlets( - simplification_queue.clone(), + &simplification_queue, &meshlets, &position_only_vertex_remap, position_only_vertex_count, @@ -127,9 +123,11 @@ impl MeshletMesh { // Group meshlets into roughly groups of size TARGET_MESHLETS_PER_GROUP, // grouping meshlets with a high number of shared vertices let groups = group_meshlets( + &simplification_queue, + &cull_data, &connected_meshlets_per_meshlet, - simplification_queue.clone(), ); + simplification_queue.clear(); // Lock borders between groups to prevent cracks when simplifying lock_group_borders( @@ -140,16 +138,20 @@ impl MeshletMesh { position_only_vertex_count, ); - let next_lod_start = meshlets.len(); - for group_meshlets in groups.into_iter() { + let simplified = groups.par_chunk_map(AsyncComputeTaskPool::get(), 1, |_, groups| { + let mut group = groups[0].clone(); + // If the group only has a single meshlet we can't simplify it - if group_meshlets.len() == 1 { - continue; + if group.meshlets.len() == 1 { + return Err(group); } + let s = debug_span!("simplify group", meshlets = group.meshlets.len()); + let _e = s.enter(); + // Simplify the group to ~50% triangle count let Some((simplified_group_indices, mut group_error)) = simplify_meshlet_group( - &group_meshlets, + &group, &meshlets, &vertices, vertex_normals, @@ -157,51 +159,70 @@ impl MeshletMesh { &vertex_locks, ) else { // Couldn't simplify the group enough - continue; + return Err(group); }; - // Compute LOD data for the group - let group_bounding_sphere = compute_lod_group_data( - &group_meshlets, - &mut group_error, - &mut bounding_spheres, - &mut simplification_errors, - ); + // Force the group error to be atleast as large as all of its constituent meshlet's + // individual errors. + for &id in group.meshlets.iter() { + group_error = group_error.max(cull_data[id as usize].error); + } + group.parent_error = group_error; // Build new meshlets using the simplified group - let new_meshlets_count = split_simplified_group_into_new_meshlets( + let new_meshlets = compute_meshlets( &simplified_group_indices, &vertices, &position_only_vertex_remap, position_only_vertex_count, - &mut meshlets, + Some((group.lod_bounds, group.parent_error)), ); - // Calculate the culling bounding sphere for the new meshlets and set their LOD group data - let new_meshlet_ids = (meshlets.len() - new_meshlets_count)..meshlets.len(); - bounding_spheres.extend(new_meshlet_ids.clone().map(|meshlet_id| { - MeshletBoundingSpheres { - culling_sphere: compute_meshlet_bounds(meshlets.get(meshlet_id), &vertices), - lod_group_sphere: group_bounding_sphere, - lod_parent_group_sphere: MeshletBoundingSphere { - center: Vec3::ZERO, - radius: 0.0, - }, + Ok((group, new_meshlets)) + }); + + let first_group = all_groups.len() as u32; + let mut passed_tris = 0; + let mut stuck_tris = 0; + for group in simplified { + match group { + Ok((group, (new_meshlets, new_cull_data))) => { + let start = meshlets.len(); + merge_meshlets(&mut meshlets, new_meshlets); + cull_data.extend(new_cull_data); + let end = meshlets.len(); + let new_meshlet_ids = start as u32..end as u32; + + passed_tris += triangles_in_meshlets(&meshlets, new_meshlet_ids.clone()); + simplification_queue.extend(new_meshlet_ids); + all_groups.push(group); } - })); - simplification_errors.extend(iter::repeat_n( - MeshletSimplificationError { - group_error, - parent_group_error: f16::MAX, - }, - new_meshlet_ids.len(), - )); + Err(group) => { + stuck_tris += + triangles_in_meshlets(&meshlets, group.meshlets.iter().copied()); + stuck.push(group); + } + } + } + + // If we have enough triangles that passed, we can retry simplifying the stuck + // meshlets. + if passed_tris > stuck_tris / 3 { + simplification_queue.extend(stuck.drain(..).flat_map(|group| group.meshlets)); } - // Set simplification queue to the list of newly created meshlets - simplification_queue = next_lod_start..meshlets.len(); + bvh.add_lod(first_group, &all_groups); } + // If there's any stuck meshlets left, add another LOD level with only them + if !stuck.is_empty() { + let first_group = all_groups.len() as u32; + all_groups.extend(stuck); + bvh.add_lod(first_group, &all_groups); + } + + let (bvh, aabb, depth) = bvh.build(&mut meshlets, all_groups, &mut cull_data); + // Copy vertex attributes per meshlet and compress let mut vertex_positions = BitVec::::new(); let mut vertex_normals = Vec::new(); @@ -227,9 +248,17 @@ impl MeshletMesh { vertex_normals: vertex_normals.into(), vertex_uvs: vertex_uvs.into(), indices: meshlets.triangles.into(), + bvh: bvh.into(), meshlets: bevy_meshlets.into(), - meshlet_bounding_spheres: bounding_spheres.into(), - meshlet_simplification_errors: simplification_errors.into(), + meshlet_cull_data: cull_data + .into_iter() + .map(|cull_data| MeshletCullData { + aabb: aabb_to_meshlet(cull_data.aabb, cull_data.error, 0), + lod_group_sphere: sphere_to_meshlet(cull_data.lod_group_sphere), + }) + .collect(), + aabb, + bvh_depth: depth, }) } } @@ -254,12 +283,19 @@ fn validate_input_mesh(mesh: &Mesh) -> Result, MeshToMeshletMeshC } } +fn triangles_in_meshlets(meshlets: &Meshlets, ids: impl IntoIterator) -> u32 { + ids.into_iter() + .map(|id| meshlets.get(id as _).triangles.len() as u32 / 3) + .sum() +} + fn compute_meshlets( indices: &[u32], vertices: &VertexDataAdapter, position_only_vertex_remap: &[u32], position_only_vertex_count: usize, -) -> Meshlets { + prev_lod_data: Option<(BoundingSphere, f32)>, +) -> (Meshlets, Vec) { // For each vertex, build a list of all triangles that use it let mut vertices_to_triangles = vec![Vec::new(); position_only_vertex_count]; for (i, index) in indices.iter().enumerate() { @@ -293,6 +329,7 @@ fn compute_meshlets( } // The order of triangles depends on hash traversal order; to produce deterministic results, sort them + // TODO: Wouldn't it be faster to use a `BTreeMap` above instead of `HashMap` + sorting? for list in connected_triangles_per_triangle.iter_mut() { list.sort_unstable(); } @@ -336,40 +373,52 @@ fn compute_meshlets( vertices: Vec::new(), triangles: Vec::new(), }; + let mut cull_data = Vec::new(); + let get_vertex = |&v: &u32| { + *bytemuck::from_bytes::( + &vertices.reader.get_ref() + [vertices.position_offset + v as usize * vertices.vertex_stride..][..12], + ) + }; for meshlet_indices in &indices_per_meshlet { let meshlet = build_meshlets(meshlet_indices, vertices, 255, 128, 0.0); - let vertex_offset = meshlets.vertices.len() as u32; - let triangle_offset = meshlets.triangles.len() as u32; - meshlets.vertices.extend_from_slice(&meshlet.vertices); - meshlets.triangles.extend_from_slice(&meshlet.triangles); - meshlets - .meshlets - .extend(meshlet.meshlets.into_iter().map(|mut meshlet| { - meshlet.vertex_offset += vertex_offset; - meshlet.triangle_offset += triangle_offset; - meshlet - })); + for meshlet in meshlet.iter() { + let (lod_group_sphere, error) = prev_lod_data.unwrap_or_else(|| { + let bounds = meshopt::compute_meshlet_bounds(meshlet, vertices); + (BoundingSphere::new(bounds.center, bounds.radius), 0.0) + }); + + cull_data.push(TempMeshletCullData { + aabb: Aabb3d::from_point_cloud( + Isometry3d::IDENTITY, + meshlet.vertices.iter().map(get_vertex), + ), + lod_group_sphere, + error, + }); + } + merge_meshlets(&mut meshlets, meshlet); } - meshlets + (meshlets, cull_data) } fn find_connected_meshlets( - simplification_queue: Range, + simplification_queue: &[u32], meshlets: &Meshlets, position_only_vertex_remap: &[u32], position_only_vertex_count: usize, ) -> Vec> { // For each vertex, build a list of all meshlets that use it let mut vertices_to_meshlets = vec![Vec::new(); position_only_vertex_count]; - for meshlet_id in simplification_queue.clone() { - let meshlet = meshlets.get(meshlet_id); + for (id_index, &meshlet_id) in simplification_queue.iter().enumerate() { + let meshlet = meshlets.get(meshlet_id as _); for index in meshlet.triangles { let vertex_id = position_only_vertex_remap[meshlet.vertices[*index as usize] as usize]; let vertex_to_meshlets = &mut vertices_to_meshlets[vertex_id as usize]; // Meshlets are added in order, so we can just check the last element to deduplicate, // in the case of two triangles sharing the same vertex within a single meshlet - if vertex_to_meshlets.last() != Some(&meshlet_id) { - vertex_to_meshlets.push(meshlet_id); + if vertex_to_meshlets.last() != Some(&id_index) { + vertex_to_meshlets.push(id_index); } } } @@ -389,13 +438,12 @@ fn find_connected_meshlets( let mut connected_meshlets_per_meshlet = vec![Vec::new(); simplification_queue.len()]; for ((meshlet_id1, meshlet_id2), shared_vertex_count) in meshlet_pair_to_shared_vertex_count { // We record both id1->id2 and id2->id1 as adjacency is symmetrical - connected_meshlets_per_meshlet[meshlet_id1 - simplification_queue.start] - .push((meshlet_id2, shared_vertex_count)); - connected_meshlets_per_meshlet[meshlet_id2 - simplification_queue.start] - .push((meshlet_id1, shared_vertex_count)); + connected_meshlets_per_meshlet[meshlet_id1].push((meshlet_id2, shared_vertex_count)); + connected_meshlets_per_meshlet[meshlet_id2].push((meshlet_id1, shared_vertex_count)); } // The order of meshlets depends on hash traversal order; to produce deterministic results, sort them + // TODO: Wouldn't it be faster to use a `BTreeMap` above instead of `HashMap` + sorting? for list in connected_meshlets_per_meshlet.iter_mut() { list.sort_unstable(); } @@ -405,16 +453,17 @@ fn find_connected_meshlets( // METIS manual: https://github.com/KarypisLab/METIS/blob/e0f1b88b8efcb24ffa0ec55eabb78fbe61e58ae7/manual/manual.pdf fn group_meshlets( + simplification_queue: &[u32], + meshlet_cull_data: &[TempMeshletCullData], connected_meshlets_per_meshlet: &[Vec<(usize, usize)>], - simplification_queue: Range, -) -> Vec> { +) -> Vec { let mut xadj = Vec::with_capacity(simplification_queue.len() + 1); let mut adjncy = Vec::new(); let mut adjwgt = Vec::new(); for connected_meshlets in connected_meshlets_per_meshlet { xadj.push(adjncy.len() as i32); for (connected_meshlet_id, shared_vertex_count) in connected_meshlets { - adjncy.push((connected_meshlet_id - simplification_queue.start) as i32); + adjncy.push(*connected_meshlet_id as i32); adjwgt.push(*shared_vertex_count as i32); // TODO: Additional weight based on meshlet spatial proximity } @@ -436,16 +485,22 @@ fn group_meshlets( .part_recursive(&mut group_per_meshlet) .unwrap(); - let mut groups = vec![SmallVec::new(); partition_count]; + let mut groups = vec![TempMeshletGroup::default(); partition_count]; for (i, meshlet_group) in group_per_meshlet.into_iter().enumerate() { - groups[meshlet_group as usize].push(i + simplification_queue.start); + let group = &mut groups[meshlet_group as usize]; + let meshlet_id = simplification_queue[i]; + + group.meshlets.push(meshlet_id); + let data = &meshlet_cull_data[meshlet_id as usize]; + group.aabb = group.aabb.merge(&data.aabb); + group.lod_bounds = merge_spheres(group.lod_bounds, data.lod_group_sphere); } groups } fn lock_group_borders( vertex_locks: &mut [bool], - groups: &[SmallVec<[usize; TARGET_MESHLETS_PER_GROUP]>], + groups: &[TempMeshletGroup], meshlets: &Meshlets, position_only_vertex_remap: &[u32], position_only_vertex_count: usize, @@ -453,9 +508,9 @@ fn lock_group_borders( let mut position_only_locks = vec![-1; position_only_vertex_count]; // Iterate over position-only based vertices of all meshlets in all groups - for (group_id, group_meshlets) in groups.iter().enumerate() { - for meshlet_id in group_meshlets { - let meshlet = meshlets.get(*meshlet_id); + for (group_id, group) in groups.iter().enumerate() { + for &meshlet_id in group.meshlets.iter() { + let meshlet = meshlets.get(meshlet_id as usize); for index in meshlet.triangles { let vertex_id = position_only_vertex_remap[meshlet.vertices[*index as usize] as usize] as usize; @@ -480,21 +535,25 @@ fn lock_group_borders( } fn simplify_meshlet_group( - group_meshlets: &[usize], + group: &TempMeshletGroup, meshlets: &Meshlets, vertices: &VertexDataAdapter<'_>, vertex_normals: &[f32], vertex_stride: usize, vertex_locks: &[bool], -) -> Option<(Vec, f16)> { +) -> Option<(Vec, f32)> { // Build a new index buffer into the mesh vertex data by combining all meshlet data in the group - let mut group_indices = Vec::new(); - for meshlet_id in group_meshlets { - let meshlet = meshlets.get(*meshlet_id); - for meshlet_index in meshlet.triangles { - group_indices.push(meshlet.vertices[*meshlet_index as usize]); - } - } + let group_indices = group + .meshlets + .iter() + .flat_map(|&meshlet_id| { + let meshlet = meshlets.get(meshlet_id as _); + meshlet + .triangles + .iter() + .map(|&meshlet_index| meshlet.vertices[meshlet_index as usize]) + }) + .collect::>(); // Simplify the group to ~50% triangle count let mut error = 0.0; @@ -511,96 +570,28 @@ fn simplify_meshlet_group( Some(&mut error), ); - // Check if we were able to simplify at least a little + // Check if we were able to simplify if simplified_group_indices.len() as f32 / group_indices.len() as f32 > SIMPLIFICATION_FAILURE_PERCENTAGE { return None; } - Some((simplified_group_indices, f16::from_f32(error))) + Some((simplified_group_indices, error)) } -fn compute_lod_group_data( - group_meshlets: &[usize], - group_error: &mut f16, - bounding_spheres: &mut [MeshletBoundingSpheres], - simplification_errors: &mut [MeshletSimplificationError], -) -> MeshletBoundingSphere { - let mut group_bounding_sphere = MeshletBoundingSphere { - center: Vec3::ZERO, - radius: 0.0, - }; - - // Compute the lod group sphere center as a weighted average of the children spheres - let mut weight = 0.0; - for meshlet_id in group_meshlets { - let meshlet_lod_bounding_sphere = bounding_spheres[*meshlet_id].lod_group_sphere; - group_bounding_sphere.center += - meshlet_lod_bounding_sphere.center * meshlet_lod_bounding_sphere.radius; - weight += meshlet_lod_bounding_sphere.radius; - } - group_bounding_sphere.center /= weight; - - // Force parent group sphere to contain all child group spheres (we're currently building the parent from its children) - // TODO: This does not produce the absolute minimal bounding sphere. Doing so is non-trivial. - // "Smallest enclosing balls of balls" http://www.inf.ethz.ch/personal/emo/DoctThesisFiles/fischer05.pdf - for meshlet_id in group_meshlets { - let meshlet_lod_bounding_sphere = bounding_spheres[*meshlet_id].lod_group_sphere; - let d = meshlet_lod_bounding_sphere - .center - .distance(group_bounding_sphere.center); - group_bounding_sphere.radius = group_bounding_sphere - .radius - .max(meshlet_lod_bounding_sphere.radius + d); - } - - // Force parent error to be >= child error (we're currently building the parent from its children) - for meshlet_id in group_meshlets { - *group_error = group_error.max(simplification_errors[*meshlet_id].group_error); - } - - // Set the children's lod parent group data to the new lod group we just made - for meshlet_id in group_meshlets { - bounding_spheres[*meshlet_id].lod_parent_group_sphere = group_bounding_sphere; - simplification_errors[*meshlet_id].parent_group_error = *group_error; - } - - group_bounding_sphere -} - -fn split_simplified_group_into_new_meshlets( - simplified_group_indices: &[u32], - vertices: &VertexDataAdapter<'_>, - position_only_vertex_remap: &[u32], - position_only_vertex_count: usize, - meshlets: &mut Meshlets, -) -> usize { - let simplified_meshlets = compute_meshlets( - simplified_group_indices, - vertices, - position_only_vertex_remap, - position_only_vertex_count, - ); - let new_meshlets_count = simplified_meshlets.len(); - +fn merge_meshlets(meshlets: &mut Meshlets, merge: Meshlets) { let vertex_offset = meshlets.vertices.len() as u32; let triangle_offset = meshlets.triangles.len() as u32; - meshlets - .vertices - .extend_from_slice(&simplified_meshlets.vertices); - meshlets - .triangles - .extend_from_slice(&simplified_meshlets.triangles); + meshlets.vertices.extend_from_slice(&merge.vertices); + meshlets.triangles.extend_from_slice(&merge.triangles); meshlets .meshlets - .extend(simplified_meshlets.meshlets.into_iter().map(|mut meshlet| { + .extend(merge.meshlets.into_iter().map(|mut meshlet| { meshlet.vertex_offset += vertex_offset; meshlet.triangle_offset += triangle_offset; meshlet })); - - new_meshlets_count } fn build_and_compress_per_meshlet_vertex_data( @@ -688,14 +679,397 @@ fn build_and_compress_per_meshlet_vertex_data( }); } -fn compute_meshlet_bounds( - meshlet: meshopt::Meshlet<'_>, - vertices: &VertexDataAdapter<'_>, -) -> MeshletBoundingSphere { - let bounds = meshopt::compute_meshlet_bounds(meshlet, vertices); +fn merge_spheres(a: BoundingSphere, b: BoundingSphere) -> BoundingSphere { + let sr = a.radius().min(b.radius()); + let br = a.radius().max(b.radius()); + let len = a.center.distance(b.center); + if len + sr <= br || sr == 0.0 || len == 0.0 { + if a.radius() > b.radius() { + a + } else { + b + } + } else { + let radius = (sr + br + len) / 2.0; + let center = + (a.center + b.center + (a.radius() - b.radius()) * (a.center - b.center) / len) / 2.0; + BoundingSphere::new(center, radius) + } +} + +#[derive(Copy, Clone)] +struct TempMeshletCullData { + aabb: Aabb3d, + lod_group_sphere: BoundingSphere, + error: f32, +} + +#[derive(Clone)] +struct TempMeshletGroup { + aabb: Aabb3d, + lod_bounds: BoundingSphere, + parent_error: f32, + meshlets: SmallVec<[u32; TARGET_MESHLETS_PER_GROUP]>, +} + +impl Default for TempMeshletGroup { + fn default() -> Self { + Self { + aabb: aabb_default(), // Default AABB to merge into + lod_bounds: BoundingSphere::new(Vec3A::ZERO, 0.0), + parent_error: f32::MAX, + meshlets: SmallVec::new(), + } + } +} + +// All the BVH build code was stolen from https://github.com/SparkyPotato/radiance/blob/4aa17a3a5be7a0466dc69713e249bbcee9f46057/crates/rad-renderer/src/assets/mesh/virtual_mesh.rs because it works and I'm lazy and don't want to reimplement it +struct TempBvhNode { + group: u32, + aabb: Aabb3d, + children: SmallVec<[u32; 8]>, +} + +#[derive(Default)] +struct BvhBuilder { + nodes: Vec, + lods: Vec>, +} + +impl BvhBuilder { + fn add_lod(&mut self, offset: u32, all_groups: &[TempMeshletGroup]) { + let first = self.nodes.len() as u32; + self.nodes.extend( + all_groups + .iter() + .enumerate() + .skip(offset as _) + .map(|(i, group)| TempBvhNode { + group: i as u32, + aabb: group.aabb, + children: SmallVec::new(), + }), + ); + let end = self.nodes.len() as u32; + if first != end { + self.lods.push(first..end); + } + } + + fn surface_area(&self, nodes: &[u32]) -> f32 { + nodes + .iter() + .map(|&x| self.nodes[x as usize].aabb) + .reduce(|a, b| a.merge(&b)) + .expect("cannot find surface area of zero nodes") + .visible_area() + } + + fn sort_nodes_by_sah(&self, nodes: &mut [u32], splits: [usize; 8]) { + // We use a BVH8, so just recursively binary split 3 times for near-optimal SAH + for i in 0..3 { + let parts = 1 << i; // 2^i + let nodes_per_split = 8 >> i; // 8 / 2^i + let half_count = nodes_per_split / 2; + let mut offset = 0; + for p in 0..parts { + let first = p * nodes_per_split; + let mut s0 = 0; + let mut s1 = 0; + for i in 0..half_count { + s0 += splits[first + i]; + s1 += splits[first + half_count + i]; + } + let c = s0 + s1; + let nodes = &mut nodes[offset..(offset + c)]; + offset += c; + + let mut cost = f32::MAX; + let mut axis = 0; + let key = |x, ax| self.nodes[x as usize].aabb.center()[ax]; + for ax in 0..3 { + nodes.sort_unstable_by(|&x, &y| key(x, ax).partial_cmp(&key(y, ax)).unwrap()); + let (left, right) = nodes.split_at(s0); + let c = self.surface_area(left) + self.surface_area(right); + if c < cost { + axis = ax; + cost = c; + } + } + if axis != 2 { + nodes.sort_unstable_by(|&x, &y| { + key(x, axis).partial_cmp(&key(y, axis)).unwrap() + }); + } + } + } + } + + fn build_temp_inner(&mut self, nodes: &mut [u32], optimize: bool) -> u32 { + let count = nodes.len(); + if count == 1 { + nodes[0] + } else if count <= 8 { + let i = self.nodes.len(); + self.nodes.push(TempBvhNode { + group: u32::MAX, + aabb: aabb_default(), + children: nodes.iter().copied().collect(), + }); + i as _ + } else { + // We need to split the nodes into 8 groups, with the smallest possible tree depth. + // Additionally, no child should be more than one level deeper than the others. + // At `l` levels, we can fit upto 8^l nodes. + // The `max_child_size` is the largest power of 8 <= `count` (any larger and we'd have + // unfilled nodes). + // The `min_child_size` is thus 1 level (8 times) smaller. + // After distributing `min_child_size` to all children, we have distributed + // `min_child_size * 8` nodes (== `max_child_size`). + // The remaining nodes are then distributed left to right. + let max_child_size = 1 << ((count.ilog2() / 3) * 3); + let min_child_size = max_child_size >> 3; + let max_extra_per_node = max_child_size - min_child_size; + let mut extra = count - max_child_size; // 8 * min_child_size + let splits = core::array::from_fn(|_| { + let size = extra.min(max_extra_per_node); + extra -= size; + min_child_size + size + }); + + if optimize { + self.sort_nodes_by_sah(nodes, splits); + } + + let mut offset = 0; + let children = splits + .into_iter() + .map(|size| { + let i = self.build_temp_inner(&mut nodes[offset..(offset + size)], optimize); + offset += size; + i + }) + .collect(); + + let i = self.nodes.len(); + self.nodes.push(TempBvhNode { + group: u32::MAX, + aabb: aabb_default(), + children, + }); + i as _ + } + } + + fn build_temp(&mut self) -> u32 { + let mut lods = Vec::with_capacity(self.lods.len()); + for lod in core::mem::take(&mut self.lods) { + let mut lod: Vec<_> = lod.collect(); + let root = self.build_temp_inner(&mut lod, true); + let node = &self.nodes[root as usize]; + if node.group != u32::MAX || node.children.len() == 8 { + lods.push(root); + } else { + lods.extend(node.children.iter().copied()); + } + } + self.build_temp_inner(&mut lods, false) + } + + fn build_inner( + &self, + groups: &[TempMeshletGroup], + out: &mut Vec, + max_depth: &mut u32, + node: u32, + depth: u32, + ) -> u32 { + *max_depth = depth.max(*max_depth); + let node = &self.nodes[node as usize]; + let onode = out.len(); + out.push(BvhNode::default()); + + for (i, &child_id) in node.children.iter().enumerate() { + let child = &self.nodes[child_id as usize]; + if child.group != u32::MAX { + let group = &groups[child.group as usize]; + let out = &mut out[onode]; + out.aabbs[i] = aabb_to_meshlet(group.aabb, group.parent_error, group.meshlets[0]); + out.lod_bounds[i] = sphere_to_meshlet(group.lod_bounds); + out.child_counts[i] = group.meshlets[1] as _; + } else { + let child_id = self.build_inner(groups, out, max_depth, child_id, depth + 1); + let child = &out[child_id as usize]; + let mut aabb = aabb_default(); + let mut parent_error = 0.0f32; + let mut lod_bounds = BoundingSphere::new(Vec3A::ZERO, 0.0); + for i in 0..8 { + if child.child_counts[i] == 0 { + break; + } + + aabb = aabb.merge(&Aabb3d::new( + child.aabbs[i].center, + child.aabbs[i].half_extent, + )); + lod_bounds = merge_spheres( + lod_bounds, + BoundingSphere::new(child.lod_bounds[i].center, child.lod_bounds[i].radius), + ); + parent_error = parent_error.max(child.aabbs[i].error); + } + + let out = &mut out[onode]; + out.aabbs[i] = aabb_to_meshlet(aabb, parent_error, child_id); + out.lod_bounds[i] = sphere_to_meshlet(lod_bounds); + out.child_counts[i] = u8::MAX; + } + } + + onode as _ + } + + fn build( + mut self, + meshlets: &mut Meshlets, + mut groups: Vec, + cull_data: &mut Vec, + ) -> (Vec, MeshletAabb, u32) { + // The BVH requires group meshlets to be contiguous, so remap them first. + let mut remap = Vec::with_capacity(meshlets.meshlets.len()); + let mut remapped_cull_data = Vec::with_capacity(cull_data.len()); + for group in groups.iter_mut() { + let first = remap.len() as u32; + let count = group.meshlets.len() as u32; + remap.extend( + group + .meshlets + .iter() + .map(|&m| meshlets.meshlets[m as usize]), + ); + remapped_cull_data.extend(group.meshlets.iter().map(|&m| cull_data[m as usize])); + group.meshlets.resize(2, 0); + group.meshlets[0] = first; + group.meshlets[1] = count; + } + meshlets.meshlets = remap; + *cull_data = remapped_cull_data; + + let mut out = vec![]; + let mut aabb = aabb_default(); + let mut max_depth = 0; + + if self.nodes.len() == 1 { + let mut o = BvhNode::default(); + let group = &groups[0]; + o.aabbs[0] = aabb_to_meshlet(group.aabb, group.parent_error, group.meshlets[0]); + o.lod_bounds[0] = sphere_to_meshlet(group.lod_bounds); + o.child_counts[0] = group.meshlets[1] as _; + out.push(o); + aabb = group.aabb; + max_depth = 1; + } else { + let root = self.build_temp(); + let root = self.build_inner(&groups, &mut out, &mut max_depth, root, 1); + assert_eq!(root, 0, "root must be 0"); + + let root = &out[0]; + for i in 0..8 { + if root.child_counts[i] == 0 { + break; + } + + aabb = aabb.merge(&Aabb3d::new( + root.aabbs[i].center, + root.aabbs[i].half_extent, + )); + } + } + + let mut reachable = vec![false; meshlets.meshlets.len()]; + verify_bvh(&out, cull_data, &mut reachable, 0); + assert!( + reachable.iter().all(|&x| x), + "all meshlets must be reachable" + ); + + ( + out, + MeshletAabb { + center: aabb.center().into(), + half_extent: aabb.half_size().into(), + }, + max_depth, + ) + } +} + +fn verify_bvh( + out: &[BvhNode], + cull_data: &[TempMeshletCullData], + reachable: &mut [bool], + node: u32, +) { + let node = &out[node as usize]; + for i in 0..8 { + let sphere = node.lod_bounds[i]; + let error = node.aabbs[i].error; + if node.child_counts[i] == u8::MAX { + let child = &out[node.aabbs[i].child_offset as usize]; + for i in 0..8 { + if child.child_counts[i] == 0 { + break; + } + assert!( + child.aabbs[i].error <= error, + "BVH errors are not monotonic" + ); + let sphere_error = (sphere.center - child.lod_bounds[i].center).length() + - (sphere.radius - child.lod_bounds[i].radius); + assert!( + sphere_error <= 0.0001, + "BVH lod spheres are not monotonic ({sphere_error})" + ); + } + verify_bvh(out, cull_data, reachable, node.aabbs[i].child_offset); + } else { + for m in 0..node.child_counts[i] as u32 { + let mid = (m + node.aabbs[i].child_offset) as usize; + let meshlet = &cull_data[mid]; + assert!(meshlet.error <= error, "meshlet errors are not monotonic"); + let sphere_error = (Vec3A::from(sphere.center) - meshlet.lod_group_sphere.center) + .length() + - (sphere.radius - meshlet.lod_group_sphere.radius()); + assert!( + sphere_error <= 0.0001, + "meshlet lod spheres are not monotonic: ({sphere_error})" + ); + reachable[mid] = true; + } + } + } +} + +fn aabb_default() -> Aabb3d { + Aabb3d { + min: Vec3A::INFINITY, + max: Vec3A::NEG_INFINITY, + } +} + +fn aabb_to_meshlet(aabb: Aabb3d, error: f32, child_offset: u32) -> MeshletAabbErrorOffset { + MeshletAabbErrorOffset { + center: aabb.center().into(), + error, + half_extent: aabb.half_size().into(), + child_offset, + } +} + +fn sphere_to_meshlet(sphere: BoundingSphere) -> MeshletBoundingSphere { MeshletBoundingSphere { - center: bounds.center.into(), - radius: bounds.radius, + center: sphere.center.into(), + radius: sphere.radius(), } } diff --git a/crates/bevy_pbr/src/meshlet/instance_manager.rs b/crates/bevy_pbr/src/meshlet/instance_manager.rs index 661d4791aeac7..33e9db1c0ccd3 100644 --- a/crates/bevy_pbr/src/meshlet/instance_manager.rs +++ b/crates/bevy_pbr/src/meshlet/instance_manager.rs @@ -1,6 +1,7 @@ use super::{meshlet_mesh_manager::MeshletMeshManager, MeshletMesh, MeshletMesh3d}; +use crate::DUMMY_MESH_MATERIAL; use crate::{ - material::DUMMY_MESH_MATERIAL, Material, MaterialBindingId, MeshFlags, MeshTransforms, + meshlet::asset::MeshletAabb, Material, MaterialBindingId, MeshFlags, MeshTransforms, MeshUniform, NotShadowCaster, NotShadowReceiver, PreviousGlobalTransform, RenderMaterialBindings, RenderMaterialInstances, }; @@ -17,26 +18,27 @@ use bevy_render::{ render_resource::StorageBuffer, sync_world::MainEntity, view::RenderLayers, MainWorld, }; use bevy_transform::components::GlobalTransform; -use core::ops::{DerefMut, Range}; +use core::ops::DerefMut; /// Manages data for each entity with a [`MeshletMesh`]. #[derive(Resource)] pub struct InstanceManager { /// Amount of instances in the scene. pub scene_instance_count: u32, - /// Amount of clusters in the scene. - pub scene_cluster_count: u32, + /// The max BVH depth of any instance in the scene. This is used to control the number of + /// dependent dispatches emitted for BVH traversal. + pub max_bvh_depth: u32, /// Per-instance [`MainEntity`], [`RenderLayers`], and [`NotShadowCaster`]. pub instances: Vec<(MainEntity, RenderLayers, bool)>, /// Per-instance [`MeshUniform`]. pub instance_uniforms: StorageBuffer>, + /// Per-instance model-space AABB. + pub instance_aabbs: StorageBuffer>, /// Per-instance material ID. pub instance_material_ids: StorageBuffer>, - /// Per-instance count of meshlets in the instance's [`MeshletMesh`]. - pub instance_meshlet_counts: StorageBuffer>, - /// Per-instance index to the start of the instance's slice of the meshlets buffer. - pub instance_meshlet_slice_starts: StorageBuffer>, + /// Per-instance index to the root node of the instance's BVH. + pub instance_bvh_root_nodes: StorageBuffer>, /// Per-view per-instance visibility bit. Used for [`RenderLayers`] and [`NotShadowCaster`] support. pub view_instance_visibility: EntityHashMap>>, @@ -52,7 +54,7 @@ impl InstanceManager { pub fn new() -> Self { Self { scene_instance_count: 0, - scene_cluster_count: 0, + max_bvh_depth: 0, instances: Vec::new(), instance_uniforms: { @@ -60,19 +62,19 @@ impl InstanceManager { buffer.set_label(Some("meshlet_instance_uniforms")); buffer }, - instance_material_ids: { + instance_aabbs: { let mut buffer = StorageBuffer::default(); - buffer.set_label(Some("meshlet_instance_material_ids")); + buffer.set_label(Some("meshlet_instance_aabbs")); buffer }, - instance_meshlet_counts: { + instance_material_ids: { let mut buffer = StorageBuffer::default(); - buffer.set_label(Some("meshlet_instance_meshlet_counts")); + buffer.set_label(Some("meshlet_instance_material_ids")); buffer }, - instance_meshlet_slice_starts: { + instance_bvh_root_nodes: { let mut buffer = StorageBuffer::default(); - buffer.set_label(Some("meshlet_instance_meshlet_slice_starts")); + buffer.set_label(Some("meshlet_instance_bvh_root_nodes")); buffer }, view_instance_visibility: EntityHashMap::default(), @@ -86,7 +88,9 @@ impl InstanceManager { pub fn add_instance( &mut self, instance: MainEntity, - meshlets_slice: Range, + root_bvh_node: u32, + aabb: MeshletAabb, + bvh_depth: u32, transform: &GlobalTransform, previous_transform: Option<&PreviousGlobalTransform>, render_layers: Option<&RenderLayers>, @@ -139,16 +143,12 @@ impl InstanceManager { not_shadow_caster, )); self.instance_uniforms.get_mut().push(mesh_uniform); + self.instance_aabbs.get_mut().push(aabb); self.instance_material_ids.get_mut().push(0); - self.instance_meshlet_counts - .get_mut() - .push(meshlets_slice.len() as u32); - self.instance_meshlet_slice_starts - .get_mut() - .push(meshlets_slice.start); + self.instance_bvh_root_nodes.get_mut().push(root_bvh_node); self.scene_instance_count += 1; - self.scene_cluster_count += meshlets_slice.len() as u32; + self.max_bvh_depth = self.max_bvh_depth.max(bvh_depth); } /// Get the material ID for a [`crate::Material`]. @@ -168,13 +168,13 @@ impl InstanceManager { pub fn reset(&mut self, entities: &Entities) { self.scene_instance_count = 0; - self.scene_cluster_count = 0; + self.max_bvh_depth = 0; self.instances.clear(); self.instance_uniforms.get_mut().clear(); + self.instance_aabbs.get_mut().clear(); self.instance_material_ids.get_mut().clear(); - self.instance_meshlet_counts.get_mut().clear(); - self.instance_meshlet_slice_starts.get_mut().clear(); + self.instance_bvh_root_nodes.get_mut().clear(); self.view_instance_visibility .retain(|view_entity, _| entities.contains(*view_entity)); self.view_instance_visibility @@ -233,6 +233,7 @@ pub fn extract_meshlet_mesh_entities( } // Iterate over every instance + // TODO: Switch to change events to not upload every instance every frame. for ( instance, meshlet_mesh, @@ -252,13 +253,15 @@ pub fn extract_meshlet_mesh_entities( } // Upload the instance's MeshletMesh asset data if not done already done - let meshlets_slice = + let (root_bvh_node, aabb, bvh_depth) = meshlet_mesh_manager.queue_upload_if_needed(meshlet_mesh.id(), &mut assets); // Add the instance's data to the instance manager instance_manager.add_instance( instance.into(), - meshlets_slice, + root_bvh_node, + aabb, + bvh_depth, transform, previous_transform, render_layers, diff --git a/crates/bevy_pbr/src/meshlet/material_pipeline_prepare.rs b/crates/bevy_pbr/src/meshlet/material_pipeline_prepare.rs index 57762bfc8a609..c49e5558bffc5 100644 --- a/crates/bevy_pbr/src/meshlet/material_pipeline_prepare.rs +++ b/crates/bevy_pbr/src/meshlet/material_pipeline_prepare.rs @@ -1,6 +1,6 @@ use super::{ - instance_manager::InstanceManager, resource_manager::ResourceManager, - MESHLET_MESH_MATERIAL_SHADER_HANDLE, + instance_manager::InstanceManager, pipelines::MeshletPipelines, + resource_manager::ResourceManager, }; use crate::{ environment_map::EnvironmentMapLight, irradiance_volume::IrradianceVolume, @@ -36,6 +36,7 @@ pub fn prepare_material_meshlet_meshes_main_opaque_pass( pipeline_cache: Res, material_pipeline: Res>, mesh_pipeline: Res, + meshlet_pipelines: Res, render_materials: Res>>, render_material_instances: Res, material_bind_group_allocator: Res>, @@ -195,7 +196,7 @@ pub fn prepare_material_meshlet_meshes_main_opaque_pass( ], push_constant_ranges: vec![], vertex: VertexState { - shader: MESHLET_MESH_MATERIAL_SHADER_HANDLE, + shader: meshlet_pipelines.meshlet_mesh_material.clone(), shader_defs: shader_defs.clone(), entry_point: material_pipeline_descriptor.vertex.entry_point, buffers: Vec::new(), @@ -211,7 +212,7 @@ pub fn prepare_material_meshlet_meshes_main_opaque_pass( multisample: MultisampleState::default(), fragment: Some(FragmentState { shader: match M::meshlet_mesh_fragment_shader() { - ShaderRef::Default => MESHLET_MESH_MATERIAL_SHADER_HANDLE, + ShaderRef::Default => meshlet_pipelines.meshlet_mesh_material.clone(), ShaderRef::Handle(handle) => handle, ShaderRef::Path(path) => asset_server.load(path), }, @@ -259,6 +260,7 @@ pub fn prepare_material_meshlet_meshes_prepass( mut instance_manager: ResMut, mut cache: Local>, pipeline_cache: Res, + meshlet_pipelines: Res, prepass_pipeline: Res>, render_materials: Res>>, render_material_instances: Res, @@ -374,7 +376,7 @@ pub fn prepare_material_meshlet_meshes_prepass( ], push_constant_ranges: vec![], vertex: VertexState { - shader: MESHLET_MESH_MATERIAL_SHADER_HANDLE, + shader: meshlet_pipelines.meshlet_mesh_material.clone(), shader_defs: shader_defs.clone(), entry_point: material_pipeline_descriptor.vertex.entry_point, buffers: Vec::new(), @@ -390,7 +392,7 @@ pub fn prepare_material_meshlet_meshes_prepass( multisample: MultisampleState::default(), fragment: Some(FragmentState { shader: match fragment_shader { - ShaderRef::Default => MESHLET_MESH_MATERIAL_SHADER_HANDLE, + ShaderRef::Default => meshlet_pipelines.meshlet_mesh_material.clone(), ShaderRef::Handle(handle) => handle, ShaderRef::Path(path) => asset_server.load(path), }, diff --git a/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl b/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl index e179e78b7ae5e..4958cc0094f89 100644 --- a/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl +++ b/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl @@ -5,6 +5,13 @@ #import bevy_pbr::prepass_bindings::PreviousViewUniforms #import bevy_pbr::utils::octahedral_decode_signed +struct BvhNode { + aabbs: array, + lod_bounds: array, 8>, + child_counts: array, + _padding: vec2, +} + struct Meshlet { start_vertex_position_bit: u32, start_vertex_attribute_id: u32, @@ -24,15 +31,34 @@ fn get_meshlet_triangle_count(meshlet: ptr) -> u32 { return extractBits((*meshlet).packed_a, 8u, 8u); } -struct MeshletBoundingSpheres { - culling_sphere: MeshletBoundingSphere, - lod_group_sphere: MeshletBoundingSphere, - lod_parent_group_sphere: MeshletBoundingSphere, +struct MeshletCullData { + aabb: MeshletAabbErrorOffset, + lod_group_sphere: vec4, } -struct MeshletBoundingSphere { +struct MeshletAabb { center: vec3, - radius: f32, + half_extent: vec3, +} + +struct MeshletAabbErrorOffset { + center_and_error: vec4, + half_extent_and_child_offset: vec4, +} + +fn get_aabb(aabb: ptr) -> MeshletAabb { + return MeshletAabb( + (*aabb).center_and_error.xyz, + (*aabb).half_extent_and_child_offset.xyz, + ); +} + +fn get_aabb_error(aabb: ptr) -> f32 { + return (*aabb).center_and_error.w; +} + +fn get_aabb_child_offset(aabb: ptr) -> u32 { + return bitcast((*aabb).half_extent_and_child_offset.w); } struct DispatchIndirectArgs { @@ -48,63 +74,128 @@ struct DrawIndirectArgs { first_instance: u32, } +struct InstancedOffset { + instance_id: u32, + offset: u32, +} + const CENTIMETERS_PER_METER = 100.0; -#ifdef MESHLET_FILL_CLUSTER_BUFFERS_PASS -var scene_instance_count: u32; -@group(0) @binding(0) var meshlet_instance_meshlet_counts: array; // Per entity instance -@group(0) @binding(1) var meshlet_instance_meshlet_slice_starts: array; // Per entity instance -@group(0) @binding(2) var meshlet_cluster_instance_ids: array; // Per cluster -@group(0) @binding(3) var meshlet_cluster_meshlet_ids: array; // Per cluster -@group(0) @binding(4) var meshlet_global_cluster_count: atomic; // Single object shared between all workgroups +#ifdef MESHLET_INSTANCE_CULLING_PASS +struct Constants { scene_instance_count: u32 } +var constants: Constants; + +// Cull data +@group(0) @binding(0) var depth_pyramid: texture_2d; +@group(0) @binding(1) var view: View; +@group(0) @binding(2) var previous_view: PreviousViewUniforms; + +// Per entity instance data +@group(0) @binding(3) var meshlet_instance_uniforms: array; +@group(0) @binding(4) var meshlet_view_instance_visibility: array; // 1 bit per entity instance, packed as a bitmask +@group(0) @binding(5) var meshlet_instance_aabbs: array; +@group(0) @binding(6) var meshlet_instance_bvh_root_nodes: array; + +// BVH cull queue data +@group(0) @binding(7) var meshlet_bvh_cull_count_write: atomic; +@group(0) @binding(8) var meshlet_bvh_cull_dispatch: DispatchIndirectArgs; +@group(0) @binding(9) var meshlet_bvh_cull_queue: array; + +// Second pass queue data +#ifdef MESHLET_FIRST_CULLING_PASS +@group(0) @binding(10) var meshlet_second_pass_instance_count: atomic; +@group(0) @binding(11) var meshlet_second_pass_instance_dispatch: DispatchIndirectArgs; +@group(0) @binding(12) var meshlet_second_pass_instance_candidates: array; +#else +@group(0) @binding(10) var meshlet_second_pass_instance_count: u32; +@group(0) @binding(11) var meshlet_second_pass_instance_candidates: array; +#endif #endif -#ifdef MESHLET_CULLING_PASS -struct Constants { scene_cluster_count: u32, meshlet_raster_cluster_rightmost_slot: u32 } +#ifdef MESHLET_BVH_CULLING_PASS +struct Constants { read_from_front: u32, rightmost_slot: u32 } var constants: Constants; -@group(0) @binding(0) var meshlet_cluster_meshlet_ids: array; // Per cluster -@group(0) @binding(1) var meshlet_bounding_spheres: array; // Per meshlet -@group(0) @binding(2) var meshlet_simplification_errors: array; // Per meshlet -@group(0) @binding(3) var meshlet_cluster_instance_ids: array; // Per cluster -@group(0) @binding(4) var meshlet_instance_uniforms: array; // Per entity instance -@group(0) @binding(5) var meshlet_view_instance_visibility: array; // 1 bit per entity instance, packed as a bitmask -@group(0) @binding(6) var meshlet_second_pass_candidates: array>; // 1 bit per cluster , packed as a bitmask -@group(0) @binding(7) var meshlet_software_raster_indirect_args: DispatchIndirectArgs; // Single object shared between all workgroups -@group(0) @binding(8) var meshlet_hardware_raster_indirect_args: DrawIndirectArgs; // Single object shared between all workgroups -@group(0) @binding(9) var meshlet_raster_clusters: array; // Single object shared between all workgroups -@group(0) @binding(10) var depth_pyramid: texture_2d; // From the end of the last frame for the first culling pass, and from the first raster pass for the second culling pass -@group(0) @binding(11) var view: View; -@group(0) @binding(12) var previous_view: PreviousViewUniforms; - -fn should_cull_instance(instance_id: u32) -> bool { - let bit_offset = instance_id % 32u; - let packed_visibility = meshlet_view_instance_visibility[instance_id / 32u]; - return bool(extractBits(packed_visibility, bit_offset, 1u)); -} - -// TODO: Load 4x per workgroup instead of once per thread? -fn cluster_is_second_pass_candidate(cluster_id: u32) -> bool { - let packed_candidates = meshlet_second_pass_candidates[cluster_id / 32u]; - let bit_offset = cluster_id % 32u; - return bool(extractBits(packed_candidates, bit_offset, 1u)); -} + +// Cull data +@group(0) @binding(0) var depth_pyramid: texture_2d; // From the end of the last frame for the first culling pass, and from the first raster pass for the second culling pass +@group(0) @binding(1) var view: View; +@group(0) @binding(2) var previous_view: PreviousViewUniforms; + +// Global mesh data +@group(0) @binding(3) var meshlet_bvh_nodes: array; + +// Per entity instance data +@group(0) @binding(4) var meshlet_instance_uniforms: array; + +// BVH cull queue data +@group(0) @binding(5) var meshlet_bvh_cull_count_read: u32; +@group(0) @binding(6) var meshlet_bvh_cull_count_write: atomic; +@group(0) @binding(7) var meshlet_bvh_cull_dispatch: DispatchIndirectArgs; +@group(0) @binding(8) var meshlet_bvh_cull_queue: array; + +// Meshlet cull queue data +@group(0) @binding(9) var meshlet_meshlet_cull_count_early: atomic; +@group(0) @binding(10) var meshlet_meshlet_cull_count_late: atomic; +@group(0) @binding(11) var meshlet_meshlet_cull_dispatch_early: DispatchIndirectArgs; +@group(0) @binding(12) var meshlet_meshlet_cull_dispatch_late: DispatchIndirectArgs; +@group(0) @binding(13) var meshlet_meshlet_cull_queue: array; + +// Second pass queue data +#ifdef MESHLET_FIRST_CULLING_PASS +@group(0) @binding(14) var meshlet_second_pass_bvh_count: atomic; +@group(0) @binding(15) var meshlet_second_pass_bvh_dispatch: DispatchIndirectArgs; +@group(0) @binding(16) var meshlet_second_pass_bvh_queue: array; +#endif +#endif + +#ifdef MESHLET_CLUSTER_CULLING_PASS +struct Constants { rightmost_slot: u32 } +var constants: Constants; + +// Cull data +@group(0) @binding(0) var depth_pyramid: texture_2d; // From the end of the last frame for the first culling pass, and from the first raster pass for the second culling pass +@group(0) @binding(1) var view: View; +@group(0) @binding(2) var previous_view: PreviousViewUniforms; + +// Global mesh data +@group(0) @binding(3) var meshlet_cull_data: array; + +// Per entity instance data +@group(0) @binding(4) var meshlet_instance_uniforms: array; + +// Raster queue data +@group(0) @binding(5) var meshlet_software_raster_indirect_args: DispatchIndirectArgs; +@group(0) @binding(6) var meshlet_hardware_raster_indirect_args: DrawIndirectArgs; +@group(0) @binding(7) var meshlet_previous_raster_counts: array; +@group(0) @binding(8) var meshlet_raster_clusters: array; + +// Meshlet cull queue data +@group(0) @binding(9) var meshlet_meshlet_cull_count_read: u32; + +// Second pass queue data +#ifdef MESHLET_FIRST_CULLING_PASS +@group(0) @binding(10) var meshlet_meshlet_cull_count_write: atomic; +@group(0) @binding(11) var meshlet_meshlet_cull_dispatch: DispatchIndirectArgs; +@group(0) @binding(12) var meshlet_meshlet_cull_queue: array; +#else +@group(0) @binding(10) var meshlet_meshlet_cull_queue: array; +#endif #endif #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS -@group(0) @binding(0) var meshlet_cluster_meshlet_ids: array; // Per cluster +@group(0) @binding(0) var meshlet_raster_clusters: array; // Per cluster @group(0) @binding(1) var meshlets: array; // Per meshlet @group(0) @binding(2) var meshlet_indices: array; // Many per meshlet @group(0) @binding(3) var meshlet_vertex_positions: array; // Many per meshlet -@group(0) @binding(4) var meshlet_cluster_instance_ids: array; // Per cluster -@group(0) @binding(5) var meshlet_instance_uniforms: array; // Per entity instance -@group(0) @binding(6) var meshlet_raster_clusters: array; // Single object shared between all workgroups -@group(0) @binding(7) var meshlet_software_raster_cluster_count: u32; +@group(0) @binding(4) var meshlet_instance_uniforms: array; // Per entity instance +@group(0) @binding(5) var meshlet_previous_raster_counts: array; +@group(0) @binding(6) var meshlet_software_raster_cluster_count: u32; #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT -@group(0) @binding(8) var meshlet_visibility_buffer: texture_storage_2d; +@group(0) @binding(7) var meshlet_visibility_buffer: texture_storage_2d; #else -@group(0) @binding(8) var meshlet_visibility_buffer: texture_storage_2d; +@group(0) @binding(7) var meshlet_visibility_buffer: texture_storage_2d; #endif -@group(0) @binding(9) var view: View; +@group(0) @binding(8) var view: View; // TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread? fn get_meshlet_vertex_id(index_id: u32) -> u32 { @@ -150,14 +241,13 @@ fn get_meshlet_vertex_position(meshlet: ptr, vertex_id: u32) #ifdef MESHLET_MESH_MATERIAL_PASS @group(1) @binding(0) var meshlet_visibility_buffer: texture_storage_2d; -@group(1) @binding(1) var meshlet_cluster_meshlet_ids: array; // Per cluster +@group(1) @binding(1) var meshlet_raster_clusters: array; // Per cluster @group(1) @binding(2) var meshlets: array; // Per meshlet @group(1) @binding(3) var meshlet_indices: array; // Many per meshlet @group(1) @binding(4) var meshlet_vertex_positions: array; // Many per meshlet @group(1) @binding(5) var meshlet_vertex_normals: array; // Many per meshlet @group(1) @binding(6) var meshlet_vertex_uvs: array>; // Many per meshlet -@group(1) @binding(7) var meshlet_cluster_instance_ids: array; // Per cluster -@group(1) @binding(8) var meshlet_instance_uniforms: array; // Per entity instance +@group(1) @binding(7) var meshlet_instance_uniforms: array; // Per entity instance // TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread? fn get_meshlet_vertex_id(index_id: u32) -> u32 { diff --git a/crates/bevy_pbr/src/meshlet/meshlet_cull_shared.wgsl b/crates/bevy_pbr/src/meshlet/meshlet_cull_shared.wgsl new file mode 100644 index 0000000000000..8f5af644ba170 --- /dev/null +++ b/crates/bevy_pbr/src/meshlet/meshlet_cull_shared.wgsl @@ -0,0 +1,205 @@ +#define_import_path bevy_pbr::meshlet_cull_shared + +#import bevy_pbr::meshlet_bindings::{ + MeshletAabb, + DispatchIndirectArgs, + InstancedOffset, + depth_pyramid, + view, + previous_view, + meshlet_instance_uniforms, +} +#import bevy_render::maths::affine3_to_square + +// https://github.com/zeux/meshoptimizer/blob/1e48e96c7e8059321de492865165e9ef071bffba/demo/nanite.cpp#L115 +fn lod_error_is_imperceptible(lod_sphere: vec4, simplification_error: f32, instance_id: u32) -> bool { + let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local); + let world_scale = max(length(world_from_local[0]), max(length(world_from_local[1]), length(world_from_local[2]))); + let camera_pos = view.world_position; + + let projection = view.clip_from_view; + if projection[3][3] == 1.0 { + let world_error = simplification_error * world_scale; + let proj = projection[1][1]; + let height = 2.0 / proj; + let norm_error = world_error / height; + return norm_error * view.viewport.w < 1.0; + } else { + var near = projection[3][2]; + let world_sphere_center = (world_from_local * vec4(lod_sphere.xyz, 1.0)).xyz; + let world_sphere_radius = lod_sphere.w * world_scale; + let d_pos = world_sphere_center - camera_pos; + let d = sqrt(dot(d_pos, d_pos)) - world_sphere_radius; + let norm_error = simplification_error / max(d, near) * projection[1][1] * 0.5; + return norm_error * view.viewport.w < 1.0; + } +} + +fn normalize_plane(p: vec4) -> vec4 { + return p / length(p.xyz); +} + +// https://fgiesen.wordpress.com/2012/08/31/frustum-planes-from-the-projection-matrix/ +// https://fgiesen.wordpress.com/2010/10/17/view-frustum-culling/ +fn aabb_in_frustum(aabb: MeshletAabb, instance_id: u32) -> bool { + let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local); + let clip_from_local = view.clip_from_world * world_from_local; + let row_major = transpose(clip_from_local); + let planes = array( + row_major[3] + row_major[0], + row_major[3] - row_major[0], + row_major[3] + row_major[1], + row_major[3] - row_major[1], + row_major[2], + ); + + for (var i = 0; i < 5; i++) { + let plane = normalize_plane(planes[i]); + let flipped = aabb.half_extent * sign(plane.xyz); + if dot(aabb.center + flipped, plane.xyz) <= -plane.w { + return false; + } + } + return true; +} + +struct ScreenAabb { + min: vec3, + max: vec3, +} + +fn min8(a: vec3, b: vec3, c: vec3, d: vec3, e: vec3, f: vec3, g: vec3, h: vec3) -> vec3 { + return min(min(min(a, b), min(c, d)), min(min(e, f), min(g, h))); +} + +fn max8(a: vec3, b: vec3, c: vec3, d: vec3, e: vec3, f: vec3, g: vec3, h: vec3) -> vec3 { + return max(max(max(a, b), max(c, d)), max(max(e, f), max(g, h))); +} + +fn min8_4(a: vec4, b: vec4, c: vec4, d: vec4, e: vec4, f: vec4, g: vec4, h: vec4) -> vec4 { + return min(min(min(a, b), min(c, d)), min(min(e, f), min(g, h))); +} + +// https://zeux.io/2023/01/12/approximate-projected-bounds/ +fn project_aabb(clip_from_local: mat4x4, near: f32, aabb: MeshletAabb, out: ptr) -> bool { + let extent = aabb.half_extent * 2.0; + let sx = clip_from_local * vec4(extent.x, 0.0, 0.0, 0.0); + let sy = clip_from_local * vec4(0.0, extent.y, 0.0, 0.0); + let sz = clip_from_local * vec4(0.0, 0.0, extent.z, 0.0); + + let p0 = clip_from_local * vec4(aabb.center - aabb.half_extent, 1.0); + let p1 = p0 + sz; + let p2 = p0 + sy; + let p3 = p2 + sz; + let p4 = p0 + sx; + let p5 = p4 + sz; + let p6 = p4 + sy; + let p7 = p6 + sz; + + let depth = min8_4(p0, p1, p2, p3, p4, p5, p6, p7).w; + // do not occlusion cull if we are inside the aabb + if depth < near { + return false; + } + + let dp0 = p0.xyz / p0.w; + let dp1 = p1.xyz / p1.w; + let dp2 = p2.xyz / p2.w; + let dp3 = p3.xyz / p3.w; + let dp4 = p4.xyz / p4.w; + let dp5 = p5.xyz / p5.w; + let dp6 = p6.xyz / p6.w; + let dp7 = p7.xyz / p7.w; + let min = min8(dp0, dp1, dp2, dp3, dp4, dp5, dp6, dp7); + let max = max8(dp0, dp1, dp2, dp3, dp4, dp5, dp6, dp7); + var vaabb = vec4(min.xy, max.xy); + // convert ndc to texture coordinates by rescaling and flipping Y + vaabb = vaabb.xwzy * vec4(0.5, -0.5, 0.5, -0.5) + 0.5; + (*out).min = vec3(vaabb.xy, min.z); + (*out).max = vec3(vaabb.zw, max.z); + return true; +} + +fn sample_hzb(smin: vec2, smax: vec2, mip: i32) -> f32 { + let texel = vec4(0, 1, 2, 3); + let sx = min(smin.x + texel, smax.xxxx); + let sy = min(smin.y + texel, smax.yyyy); + // TODO: switch to min samplers when wgpu has them + // sampling 16 times a finer mip is worth the extra cost for better culling + let a = sample_hzb_row(sx, sy.x, mip); + let b = sample_hzb_row(sx, sy.y, mip); + let c = sample_hzb_row(sx, sy.z, mip); + let d = sample_hzb_row(sx, sy.w, mip); + return min(min(a, b), min(c, d)); +} + +fn sample_hzb_row(sx: vec4, sy: u32, mip: i32) -> f32 { + let a = textureLoad(depth_pyramid, vec2(sx.x, sy), mip).x; + let b = textureLoad(depth_pyramid, vec2(sx.y, sy), mip).x; + let c = textureLoad(depth_pyramid, vec2(sx.z, sy), mip).x; + let d = textureLoad(depth_pyramid, vec2(sx.w, sy), mip).x; + return min(min(a, b), min(c, d)); +} + +// TODO: We should probably be using a POT HZB texture? +fn occlusion_cull_screen_aabb(aabb: ScreenAabb, screen: vec2) -> bool { + let hzb_size = ceil(screen * 0.5); + let aabb_min = aabb.min.xy * hzb_size; + let aabb_max = aabb.max.xy * hzb_size; + + let min_texel = vec2(max(aabb_min, vec2(0.0))); + let max_texel = vec2(min(aabb_max, hzb_size - 1.0)); + let size = max_texel - min_texel; + let max_size = max(size.x, size.y); + + // note: add 1 before max because the unsigned overflow behavior is intentional + // it wraps around firstLeadingBit(0) = ~0 to 0 + // TODO: we actually sample a 4x4 block, so ideally this would be `max(..., 3u) - 3u`. + // However, since our HZB is not a power of two, we need to be extra-conservative to not over-cull, so we go up a mip. + var mip = max(firstLeadingBit(max_size) + 1u, 2u) - 2u; + + if any((max_texel >> vec2(mip)) > (min_texel >> vec2(mip)) + 3) { + mip += 1u; + } + + let smin = min_texel >> vec2(mip); + let smax = max_texel >> vec2(mip); + + let curr_depth = sample_hzb(smin, smax, i32(mip)); + return aabb.max.z <= curr_depth; +} + +fn occlusion_cull_projection() -> mat4x4 { +#ifdef FIRST_CULLING_PASS + return view.clip_from_world; +#else + return previous_view.clip_from_world; +#endif +} + +fn occlusion_cull_clip_from_local(instance_id: u32) -> mat4x4 { +#ifdef FIRST_CULLING_PASS + let prev_world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].previous_world_from_local); + return previous_view.clip_from_world * prev_world_from_local; +#else + let world_from_local = affine3_to_square(meshlet_instance_uniforms[instance_id].world_from_local); + return view.clip_from_world * world_from_local; +#endif +} + +fn should_occlusion_cull_aabb(aabb: MeshletAabb, instance_id: u32) -> bool { + let projection = occlusion_cull_projection(); + var near: f32; + if projection[3][3] == 1.0 { + near = projection[3][2] / projection[2][2]; + } else { + near = projection[3][2]; + } + + let clip_from_local = occlusion_cull_clip_from_local(instance_id); + var screen_aabb = ScreenAabb(vec3(0.0), vec3(0.0)); + if project_aabb(clip_from_local, near, aabb, &screen_aabb) { + return occlusion_cull_screen_aabb(screen_aabb, view.viewport.zw); + } + return false; +} diff --git a/crates/bevy_pbr/src/meshlet/meshlet_mesh_manager.rs b/crates/bevy_pbr/src/meshlet/meshlet_mesh_manager.rs index 0f4aab7509a82..93eb5a1afe185 100644 --- a/crates/bevy_pbr/src/meshlet/meshlet_mesh_manager.rs +++ b/crates/bevy_pbr/src/meshlet/meshlet_mesh_manager.rs @@ -1,8 +1,6 @@ -use super::{ - asset::{Meshlet, MeshletBoundingSpheres, MeshletSimplificationError}, - persistent_buffer::PersistentGpuBuffer, - MeshletMesh, -}; +use crate::meshlet::asset::{BvhNode, MeshletAabb, MeshletCullData}; + +use super::{asset::Meshlet, persistent_buffer::PersistentGpuBuffer, MeshletMesh}; use alloc::sync::Arc; use bevy_asset::{AssetId, Assets}; use bevy_ecs::{ @@ -25,10 +23,11 @@ pub struct MeshletMeshManager { pub vertex_normals: PersistentGpuBuffer>, pub vertex_uvs: PersistentGpuBuffer>, pub indices: PersistentGpuBuffer>, + pub bvh_nodes: PersistentGpuBuffer>, pub meshlets: PersistentGpuBuffer>, - pub meshlet_bounding_spheres: PersistentGpuBuffer>, - pub meshlet_simplification_errors: PersistentGpuBuffer>, - meshlet_mesh_slices: HashMap, [Range; 7]>, + pub meshlet_cull_data: PersistentGpuBuffer>, + meshlet_mesh_slices: + HashMap, ([Range; 7], MeshletAabb, u32)>, } impl FromWorld for MeshletMeshManager { @@ -39,26 +38,21 @@ impl FromWorld for MeshletMeshManager { vertex_normals: PersistentGpuBuffer::new("meshlet_vertex_normals", render_device), vertex_uvs: PersistentGpuBuffer::new("meshlet_vertex_uvs", render_device), indices: PersistentGpuBuffer::new("meshlet_indices", render_device), + bvh_nodes: PersistentGpuBuffer::new("meshlet_bvh_nodes", render_device), meshlets: PersistentGpuBuffer::new("meshlets", render_device), - meshlet_bounding_spheres: PersistentGpuBuffer::new( - "meshlet_bounding_spheres", - render_device, - ), - meshlet_simplification_errors: PersistentGpuBuffer::new( - "meshlet_simplification_errors", - render_device, - ), + meshlet_cull_data: PersistentGpuBuffer::new("meshlet_cull_data", render_device), meshlet_mesh_slices: HashMap::default(), } } } impl MeshletMeshManager { + // Returns the index of the root BVH node, as well as the depth of the BVH. pub fn queue_upload_if_needed( &mut self, asset_id: AssetId, assets: &mut Assets, - ) -> Range { + ) -> (u32, MeshletAabb, u32) { let queue_meshlet_mesh = |asset_id: &AssetId| { let meshlet_mesh = assets.remove_untracked(*asset_id).expect( "MeshletMesh asset was already unloaded but is not registered with MeshletMeshManager", @@ -84,51 +78,59 @@ impl MeshletMeshManager { indices_slice.start, ), ); - let meshlet_bounding_spheres_slice = self - .meshlet_bounding_spheres - .queue_write(Arc::clone(&meshlet_mesh.meshlet_bounding_spheres), ()); - let meshlet_simplification_errors_slice = self - .meshlet_simplification_errors - .queue_write(Arc::clone(&meshlet_mesh.meshlet_simplification_errors), ()); + let base_meshlet_index = (meshlets_slice.start / size_of::() as u64) as u32; + let bvh_node_slice = self + .bvh_nodes + .queue_write(Arc::clone(&meshlet_mesh.bvh), base_meshlet_index); + let meshlet_cull_data_slice = self + .meshlet_cull_data + .queue_write(Arc::clone(&meshlet_mesh.meshlet_cull_data), ()); - [ - vertex_positions_slice, - vertex_normals_slice, - vertex_uvs_slice, - indices_slice, - meshlets_slice, - meshlet_bounding_spheres_slice, - meshlet_simplification_errors_slice, - ] + ( + [ + vertex_positions_slice, + vertex_normals_slice, + vertex_uvs_slice, + indices_slice, + bvh_node_slice, + meshlets_slice, + meshlet_cull_data_slice, + ], + meshlet_mesh.aabb, + meshlet_mesh.bvh_depth, + ) }; // If the MeshletMesh asset has not been uploaded to the GPU yet, queue it for uploading - let [_, _, _, _, meshlets_slice, _, _] = self + let ([_, _, _, _, bvh_node_slice, _, _], aabb, bvh_depth) = self .meshlet_mesh_slices .entry(asset_id) .or_insert_with_key(queue_meshlet_mesh) .clone(); - let meshlets_slice_start = meshlets_slice.start as u32 / size_of::() as u32; - let meshlets_slice_end = meshlets_slice.end as u32 / size_of::() as u32; - meshlets_slice_start..meshlets_slice_end + ( + (bvh_node_slice.start / size_of::() as u64) as u32, + aabb, + bvh_depth, + ) } pub fn remove(&mut self, asset_id: &AssetId) { - if let Some( - [vertex_positions_slice, vertex_normals_slice, vertex_uvs_slice, indices_slice, meshlets_slice, meshlet_bounding_spheres_slice, meshlet_simplification_errors_slice], - ) = self.meshlet_mesh_slices.remove(asset_id) + if let Some(( + [vertex_positions_slice, vertex_normals_slice, vertex_uvs_slice, indices_slice, bvh_node_slice, meshlets_slice, meshlet_cull_data_slice], + _, + _, + )) = self.meshlet_mesh_slices.remove(asset_id) { self.vertex_positions .mark_slice_unused(vertex_positions_slice); self.vertex_normals.mark_slice_unused(vertex_normals_slice); self.vertex_uvs.mark_slice_unused(vertex_uvs_slice); self.indices.mark_slice_unused(indices_slice); + self.bvh_nodes.mark_slice_unused(bvh_node_slice); self.meshlets.mark_slice_unused(meshlets_slice); - self.meshlet_bounding_spheres - .mark_slice_unused(meshlet_bounding_spheres_slice); - self.meshlet_simplification_errors - .mark_slice_unused(meshlet_simplification_errors_slice); + self.meshlet_cull_data + .mark_slice_unused(meshlet_cull_data_slice); } } } @@ -152,12 +154,12 @@ pub fn perform_pending_meshlet_mesh_writes( .indices .perform_writes(&render_queue, &render_device); meshlet_mesh_manager - .meshlets + .bvh_nodes .perform_writes(&render_queue, &render_device); meshlet_mesh_manager - .meshlet_bounding_spheres + .meshlets .perform_writes(&render_queue, &render_device); meshlet_mesh_manager - .meshlet_simplification_errors + .meshlet_cull_data .perform_writes(&render_queue, &render_device); } diff --git a/crates/bevy_pbr/src/meshlet/mod.rs b/crates/bevy_pbr/src/meshlet/mod.rs index 2375894613e52..f5e42f171088e 100644 --- a/crates/bevy_pbr/src/meshlet/mod.rs +++ b/crates/bevy_pbr/src/meshlet/mod.rs @@ -58,7 +58,7 @@ use self::{ }; use crate::{graph::NodePbr, PreviousGlobalTransform}; use bevy_app::{App, Plugin}; -use bevy_asset::{load_internal_asset, weak_handle, AssetApp, AssetId, Handle}; +use bevy_asset::{embedded_asset, AssetApp, AssetId, Handle}; use bevy_core_pipeline::{ core_3d::graph::{Core3d, Node3d}, prepass::{DeferredPrepass, MotionVectorPrepass, NormalPrepass}, @@ -74,8 +74,8 @@ use bevy_ecs::{ }; use bevy_reflect::{std_traits::ReflectDefault, Reflect}; use bevy_render::{ + load_shader_library, render_graph::{RenderGraphApp, ViewNodeRunner}, - render_resource::Shader, renderer::RenderDevice, settings::WgpuFeatures, view::{self, prepare_view_targets, Msaa, Visibility, VisibilityClass}, @@ -85,11 +85,6 @@ use bevy_transform::components::Transform; use derive_more::From; use tracing::error; -const MESHLET_BINDINGS_SHADER_HANDLE: Handle = - weak_handle!("d90ac78c-500f-48aa-b488-cc98eb3f6314"); -const MESHLET_MESH_MATERIAL_SHADER_HANDLE: Handle = - weak_handle!("db8d9001-6ca7-4d00-968a-d5f5b96b89c3"); - /// Provides a plugin for rendering large amounts of high-poly 3d meshes using an efficient GPU-driven method. See also [`MeshletMesh`]. /// /// Rendering dense scenes made of high-poly meshes with thousands or millions of triangles is extremely expensive in Bevy's standard renderer. @@ -152,66 +147,19 @@ impl Plugin for MeshletPlugin { std::process::exit(1); } - load_internal_asset!( - app, - MESHLET_CLEAR_VISIBILITY_BUFFER_SHADER_HANDLE, - "clear_visibility_buffer.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_BINDINGS_SHADER_HANDLE, - "meshlet_bindings.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - super::MESHLET_VISIBILITY_BUFFER_RESOLVE_SHADER_HANDLE, - "visibility_buffer_resolve.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_FILL_CLUSTER_BUFFERS_SHADER_HANDLE, - "fill_cluster_buffers.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_CULLING_SHADER_HANDLE, - "cull_clusters.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE, - "visibility_buffer_software_raster.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, - "visibility_buffer_hardware_raster.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_MESH_MATERIAL_SHADER_HANDLE, - "meshlet_mesh_material.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_RESOLVE_RENDER_TARGETS_SHADER_HANDLE, - "resolve_render_targets.wgsl", - Shader::from_wgsl - ); - load_internal_asset!( - app, - MESHLET_REMAP_1D_TO_2D_DISPATCH_SHADER_HANDLE, - "remap_1d_to_2d_dispatch.wgsl", - Shader::from_wgsl - ); + embedded_asset!(app, "clear_visibility_buffer.wgsl"); + load_shader_library!(app, "meshlet_bindings.wgsl"); + load_shader_library!(app, "visibility_buffer_resolve.wgsl"); + load_shader_library!(app, "meshlet_cull_shared.wgsl"); + embedded_asset!(app, "cull_instances.wgsl"); + embedded_asset!(app, "cull_bvh.wgsl"); + embedded_asset!(app, "cull_clusters.wgsl"); + embedded_asset!(app, "visibility_buffer_software_raster.wgsl"); + embedded_asset!(app, "visibility_buffer_hardware_raster.wgsl"); + embedded_asset!(app, "meshlet_mesh_material.wgsl"); + embedded_asset!(app, "resolve_render_targets.wgsl"); + embedded_asset!(app, "remap_1d_to_2d_dispatch.wgsl"); + embedded_asset!(app, "fill_counts.wgsl"); app.init_asset::() .register_asset_loader(MeshletMeshLoader); diff --git a/crates/bevy_pbr/src/meshlet/persistent_buffer.rs b/crates/bevy_pbr/src/meshlet/persistent_buffer.rs index 85dec457f9808..e8f4669227a0b 100644 --- a/crates/bevy_pbr/src/meshlet/persistent_buffer.rs +++ b/crates/bevy_pbr/src/meshlet/persistent_buffer.rs @@ -71,7 +71,7 @@ impl PersistentGpuBuffer { let mut buffer_view = render_queue .write_buffer_with(&self.buffer, buffer_slice.start, buffer_slice_size) .unwrap(); - data.write_bytes_le(metadata, &mut buffer_view); + data.write_bytes_le(metadata, &mut buffer_view, buffer_slice.start); } let queue_saturation = queue_count as f32 / self.write_queue.capacity() as f32; @@ -123,5 +123,10 @@ pub trait PersistentGpuBufferable { /// Convert `self` + `metadata` into bytes (little-endian), and write to the provided buffer slice. /// Any bytes not written to in the slice will be zeroed out when uploaded to the GPU. - fn write_bytes_le(&self, metadata: Self::Metadata, buffer_slice: &mut [u8]); + fn write_bytes_le( + &self, + metadata: Self::Metadata, + buffer_slice: &mut [u8], + buffer_offset: BufferAddress, + ); } diff --git a/crates/bevy_pbr/src/meshlet/persistent_buffer_impls.rs b/crates/bevy_pbr/src/meshlet/persistent_buffer_impls.rs index 9c2667d3f3d4d..d29e5439bea6f 100644 --- a/crates/bevy_pbr/src/meshlet/persistent_buffer_impls.rs +++ b/crates/bevy_pbr/src/meshlet/persistent_buffer_impls.rs @@ -1,9 +1,46 @@ -use super::{ - asset::{Meshlet, MeshletBoundingSpheres, MeshletSimplificationError}, - persistent_buffer::PersistentGpuBufferable, -}; +use crate::meshlet::asset::{BvhNode, MeshletAabbErrorOffset, MeshletCullData}; + +use super::{asset::Meshlet, persistent_buffer::PersistentGpuBufferable}; use alloc::sync::Arc; use bevy_math::Vec2; +use bevy_render::render_resource::BufferAddress; + +impl PersistentGpuBufferable for Arc<[BvhNode]> { + type Metadata = u32; + + fn size_in_bytes(&self) -> usize { + self.len() * size_of::() + } + + fn write_bytes_le( + &self, + base_meshlet_index: Self::Metadata, + buffer_slice: &mut [u8], + buffer_offset: BufferAddress, + ) { + let size = size_of::(); + let base_bvh_node_index = (buffer_offset / size as u64) as u32; + for (i, &node) in self.iter().enumerate() { + let bytes = bytemuck::cast::<_, [u8; size_of::()]>(BvhNode { + aabbs: core::array::from_fn(|i| { + let aabb = node.aabbs[i]; + MeshletAabbErrorOffset { + child_offset: aabb.child_offset + + if node.child_counts[i] == u8::MAX { + base_bvh_node_index + } else { + base_meshlet_index + }, + ..aabb + } + }), + ..node + }); + let i = i * size; + buffer_slice[i..(i + size)].clone_from_slice(&bytes); + } + } +} impl PersistentGpuBufferable for Arc<[Meshlet]> { type Metadata = (u64, u64, u64); @@ -16,6 +53,7 @@ impl PersistentGpuBufferable for Arc<[Meshlet]> { &self, (vertex_position_offset, vertex_attribute_offset, index_offset): Self::Metadata, buffer_slice: &mut [u8], + _: BufferAddress, ) { let vertex_position_offset = (vertex_position_offset * 8) as u32; let vertex_attribute_offset = (vertex_attribute_offset as usize / size_of::()) as u32; @@ -37,62 +75,50 @@ impl PersistentGpuBufferable for Arc<[Meshlet]> { } } -impl PersistentGpuBufferable for Arc<[u8]> { +impl PersistentGpuBufferable for Arc<[MeshletCullData]> { type Metadata = (); fn size_in_bytes(&self) -> usize { - self.len() + self.len() * size_of::() } - fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8]) { - buffer_slice.clone_from_slice(self); - } -} - -impl PersistentGpuBufferable for Arc<[u32]> { - type Metadata = (); - - fn size_in_bytes(&self) -> usize { - self.len() * size_of::() - } - - fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8]) { + fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8], _: BufferAddress) { buffer_slice.clone_from_slice(bytemuck::cast_slice(self)); } } -impl PersistentGpuBufferable for Arc<[Vec2]> { +impl PersistentGpuBufferable for Arc<[u8]> { type Metadata = (); fn size_in_bytes(&self) -> usize { - self.len() * size_of::() + self.len() } - fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8]) { - buffer_slice.clone_from_slice(bytemuck::cast_slice(self)); + fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8], _: BufferAddress) { + buffer_slice.clone_from_slice(self); } } -impl PersistentGpuBufferable for Arc<[MeshletBoundingSpheres]> { +impl PersistentGpuBufferable for Arc<[u32]> { type Metadata = (); fn size_in_bytes(&self) -> usize { - self.len() * size_of::() + self.len() * size_of::() } - fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8]) { + fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8], _: BufferAddress) { buffer_slice.clone_from_slice(bytemuck::cast_slice(self)); } } -impl PersistentGpuBufferable for Arc<[MeshletSimplificationError]> { +impl PersistentGpuBufferable for Arc<[Vec2]> { type Metadata = (); fn size_in_bytes(&self) -> usize { - self.len() * size_of::() + self.len() * size_of::() } - fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8]) { + fn write_bytes_le(&self, _: Self::Metadata, buffer_slice: &mut [u8], _: BufferAddress) { buffer_slice.clone_from_slice(bytemuck::cast_slice(self)); } } diff --git a/crates/bevy_pbr/src/meshlet/pipelines.rs b/crates/bevy_pbr/src/meshlet/pipelines.rs index c25d896b8a385..c1c50eb4c84a8 100644 --- a/crates/bevy_pbr/src/meshlet/pipelines.rs +++ b/crates/bevy_pbr/src/meshlet/pipelines.rs @@ -1,5 +1,5 @@ use super::resource_manager::ResourceManager; -use bevy_asset::{weak_handle, Handle}; +use bevy_asset::{load_embedded_asset, Handle}; use bevy_core_pipeline::{ core_3d::CORE_3D_DEPTH_FORMAT, experimental::mip_generation::DOWNSAMPLE_DEPTH_SHADER_HANDLE, fullscreen_vertex_shader::fullscreen_shader_vertex_state, @@ -10,28 +10,16 @@ use bevy_ecs::{ }; use bevy_render::render_resource::*; -pub const MESHLET_CLEAR_VISIBILITY_BUFFER_SHADER_HANDLE: Handle = - weak_handle!("a4bf48e4-5605-4d1c-987e-29c7b1ec95dc"); -pub const MESHLET_FILL_CLUSTER_BUFFERS_SHADER_HANDLE: Handle = - weak_handle!("80ccea4a-8234-4ee0-af74-77b3cad503cf"); -pub const MESHLET_CULLING_SHADER_HANDLE: Handle = - weak_handle!("d71c5879-97fa-49d1-943e-ed9162fe8adb"); -pub const MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE: Handle = - weak_handle!("68cc6826-8321-43d1-93d5-4f61f0456c13"); -pub const MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE: Handle = - weak_handle!("4b4e3020-748f-4baf-b011-87d9d2a12796"); -pub const MESHLET_RESOLVE_RENDER_TARGETS_SHADER_HANDLE: Handle = - weak_handle!("c218ce17-cf59-4268-8898-13ecf384f133"); -pub const MESHLET_REMAP_1D_TO_2D_DISPATCH_SHADER_HANDLE: Handle = - weak_handle!("f5b7edfc-2eac-4407-8f5c-1265d4d795c2"); - #[derive(Resource)] pub struct MeshletPipelines { - fill_cluster_buffers: CachedComputePipelineId, clear_visibility_buffer: CachedComputePipelineId, clear_visibility_buffer_shadow_view: CachedComputePipelineId, - cull_first: CachedComputePipelineId, - cull_second: CachedComputePipelineId, + first_instance_cull: CachedComputePipelineId, + second_instance_cull: CachedComputePipelineId, + first_bvh_cull: CachedComputePipelineId, + second_bvh_cull: CachedComputePipelineId, + first_meshlet_cull: CachedComputePipelineId, + second_meshlet_cull: CachedComputePipelineId, downsample_depth_first: CachedComputePipelineId, downsample_depth_second: CachedComputePipelineId, downsample_depth_first_shadow_view: CachedComputePipelineId, @@ -45,21 +33,35 @@ pub struct MeshletPipelines { resolve_depth_shadow_view: CachedRenderPipelineId, resolve_material_depth: CachedRenderPipelineId, remap_1d_to_2d_dispatch: Option, + fill_counts: CachedComputePipelineId, + pub(crate) meshlet_mesh_material: Handle, } impl FromWorld for MeshletPipelines { fn from_world(world: &mut World) -> Self { let resource_manager = world.resource::(); - let fill_cluster_buffers_bind_group_layout = resource_manager - .fill_cluster_buffers_bind_group_layout - .clone(); let clear_visibility_buffer_bind_group_layout = resource_manager .clear_visibility_buffer_bind_group_layout .clone(); let clear_visibility_buffer_shadow_view_bind_group_layout = resource_manager .clear_visibility_buffer_shadow_view_bind_group_layout .clone(); - let cull_layout = resource_manager.culling_bind_group_layout.clone(); + let first_instance_cull_bind_group_layout = resource_manager + .first_instance_cull_bind_group_layout + .clone(); + let second_instance_cull_bind_group_layout = resource_manager + .second_instance_cull_bind_group_layout + .clone(); + let first_bvh_cull_bind_group_layout = + resource_manager.first_bvh_cull_bind_group_layout.clone(); + let second_bvh_cull_bind_group_layout = + resource_manager.second_bvh_cull_bind_group_layout.clone(); + let first_meshlet_cull_bind_group_layout = resource_manager + .first_meshlet_cull_bind_group_layout + .clone(); + let second_meshlet_cull_bind_group_layout = resource_manager + .second_meshlet_cull_bind_group_layout + .clone(); let downsample_depth_layout = resource_manager.downsample_depth_bind_group_layout.clone(); let downsample_depth_shadow_view_layout = resource_manager .downsample_depth_shadow_view_bind_group_layout @@ -80,24 +82,24 @@ impl FromWorld for MeshletPipelines { let remap_1d_to_2d_dispatch_layout = resource_manager .remap_1d_to_2d_dispatch_bind_group_layout .clone(); + let fill_counts_layout = resource_manager.fill_counts_bind_group_layout.clone(); + + let clear_visibility_buffer = load_embedded_asset!(world, "clear_visibility_buffer.wgsl"); + let cull_instances = load_embedded_asset!(world, "cull_instances.wgsl"); + let cull_bvh = load_embedded_asset!(world, "cull_bvh.wgsl"); + let cull_clusters = load_embedded_asset!(world, "cull_clusters.wgsl"); + let visibility_buffer_software_raster = + load_embedded_asset!(world, "visibility_buffer_software_raster.wgsl"); + let visibility_buffer_hardware_raster = + load_embedded_asset!(world, "visibility_buffer_hardware_raster.wgsl"); + let resolve_render_targets = load_embedded_asset!(world, "resolve_render_targets.wgsl"); + let remap_1d_to_2d_dispatch = load_embedded_asset!(world, "remap_1d_to_2d_dispatch.wgsl"); + let fill_counts = load_embedded_asset!(world, "fill_counts.wgsl"); + let meshlet_mesh_material = load_embedded_asset!(world, "meshlet_mesh_material.wgsl"); + let pipeline_cache = world.resource_mut::(); Self { - fill_cluster_buffers: pipeline_cache.queue_compute_pipeline( - ComputePipelineDescriptor { - label: Some("meshlet_fill_cluster_buffers_pipeline".into()), - layout: vec![fill_cluster_buffers_bind_group_layout], - push_constant_ranges: vec![PushConstantRange { - stages: ShaderStages::COMPUTE, - range: 0..4, - }], - shader: MESHLET_FILL_CLUSTER_BUFFERS_SHADER_HANDLE, - shader_defs: vec!["MESHLET_FILL_CLUSTER_BUFFERS_PASS".into()], - entry_point: "fill_cluster_buffers".into(), - zero_initialize_workgroup_memory: false, - }, - ), - clear_visibility_buffer: pipeline_cache.queue_compute_pipeline( ComputePipelineDescriptor { label: Some("meshlet_clear_visibility_buffer_pipeline".into()), @@ -106,7 +108,7 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_CLEAR_VISIBILITY_BUFFER_SHADER_HANDLE, + shader: clear_visibility_buffer.clone(), shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], entry_point: "clear_visibility_buffer".into(), zero_initialize_workgroup_memory: false, @@ -121,39 +123,105 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_CLEAR_VISIBILITY_BUFFER_SHADER_HANDLE, + shader: clear_visibility_buffer, shader_defs: vec![], entry_point: "clear_visibility_buffer".into(), zero_initialize_workgroup_memory: false, }, ), - cull_first: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { - label: Some("meshlet_culling_first_pipeline".into()), - layout: vec![cull_layout.clone()], + first_instance_cull: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { + label: Some("meshlet_first_instance_cull_pipeline".into()), + layout: vec![first_instance_cull_bind_group_layout.clone()], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..4, + }], + shader: cull_instances.clone(), + shader_defs: vec![ + "MESHLET_INSTANCE_CULLING_PASS".into(), + "MESHLET_FIRST_CULLING_PASS".into(), + ], + entry_point: "cull_instances".into(), + zero_initialize_workgroup_memory: false, + }), + + second_instance_cull: pipeline_cache.queue_compute_pipeline( + ComputePipelineDescriptor { + label: Some("meshlet_second_instance_cull_pipeline".into()), + layout: vec![second_instance_cull_bind_group_layout.clone()], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..4, + }], + shader: cull_instances, + shader_defs: vec![ + "MESHLET_INSTANCE_CULLING_PASS".into(), + "MESHLET_SECOND_CULLING_PASS".into(), + ], + entry_point: "cull_instances".into(), + zero_initialize_workgroup_memory: false, + }, + ), + + first_bvh_cull: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { + label: Some("meshlet_first_bvh_cull_pipeline".into()), + layout: vec![first_bvh_cull_bind_group_layout.clone()], push_constant_ranges: vec![PushConstantRange { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_CULLING_SHADER_HANDLE, + shader: cull_bvh.clone(), shader_defs: vec![ - "MESHLET_CULLING_PASS".into(), + "MESHLET_BVH_CULLING_PASS".into(), "MESHLET_FIRST_CULLING_PASS".into(), ], - entry_point: "cull_clusters".into(), + entry_point: "cull_bvh".into(), zero_initialize_workgroup_memory: false, }), - cull_second: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { - label: Some("meshlet_culling_second_pipeline".into()), - layout: vec![cull_layout], + second_bvh_cull: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { + label: Some("meshlet_second_bvh_cull_pipeline".into()), + layout: vec![second_bvh_cull_bind_group_layout.clone()], push_constant_ranges: vec![PushConstantRange { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_CULLING_SHADER_HANDLE, + shader: cull_bvh, shader_defs: vec![ - "MESHLET_CULLING_PASS".into(), + "MESHLET_BVH_CULLING_PASS".into(), + "MESHLET_SECOND_CULLING_PASS".into(), + ], + entry_point: "cull_bvh".into(), + zero_initialize_workgroup_memory: false, + }), + + first_meshlet_cull: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { + label: Some("meshlet_first_meshlet_cull_pipeline".into()), + layout: vec![first_meshlet_cull_bind_group_layout.clone()], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..4, + }], + shader: cull_clusters.clone(), + shader_defs: vec![ + "MESHLET_CLUSTER_CULLING_PASS".into(), + "MESHLET_FIRST_CULLING_PASS".into(), + ], + entry_point: "cull_clusters".into(), + zero_initialize_workgroup_memory: false, + }), + + second_meshlet_cull: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { + label: Some("meshlet_second_meshlet_cull_pipeline".into()), + layout: vec![second_meshlet_cull_bind_group_layout.clone()], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..4, + }], + shader: cull_clusters, + shader_defs: vec![ + "MESHLET_CLUSTER_CULLING_PASS".into(), "MESHLET_SECOND_CULLING_PASS".into(), ], entry_point: "cull_clusters".into(), @@ -231,7 +299,7 @@ impl FromWorld for MeshletPipelines { label: Some("meshlet_visibility_buffer_software_raster_pipeline".into()), layout: vec![visibility_buffer_raster_layout.clone()], push_constant_ranges: vec![], - shader: MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_software_raster.clone(), shader_defs: vec![ "MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into(), "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), @@ -254,7 +322,7 @@ impl FromWorld for MeshletPipelines { ), layout: vec![visibility_buffer_raster_shadow_view_layout.clone()], push_constant_ranges: vec![], - shader: MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_software_raster, shader_defs: vec![ "MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into(), if remap_1d_to_2d_dispatch_layout.is_some() { @@ -278,7 +346,7 @@ impl FromWorld for MeshletPipelines { range: 0..4, }], vertex: VertexState { - shader: MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_hardware_raster.clone(), shader_defs: vec![ "MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into(), "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), @@ -298,7 +366,7 @@ impl FromWorld for MeshletPipelines { depth_stencil: None, multisample: MultisampleState::default(), fragment: Some(FragmentState { - shader: MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_hardware_raster.clone(), shader_defs: vec![ "MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into(), "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), @@ -325,7 +393,7 @@ impl FromWorld for MeshletPipelines { range: 0..4, }], vertex: VertexState { - shader: MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_hardware_raster.clone(), shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into()], entry_point: "vertex".into(), buffers: vec![], @@ -342,7 +410,7 @@ impl FromWorld for MeshletPipelines { depth_stencil: None, multisample: MultisampleState::default(), fragment: Some(FragmentState { - shader: MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_hardware_raster.clone(), shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into()], entry_point: "fragment".into(), targets: vec![Some(ColorTargetState { @@ -367,7 +435,7 @@ impl FromWorld for MeshletPipelines { range: 0..4, }], vertex: VertexState { - shader: MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_hardware_raster.clone(), shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into()], entry_point: "vertex".into(), buffers: vec![], @@ -384,7 +452,7 @@ impl FromWorld for MeshletPipelines { depth_stencil: None, multisample: MultisampleState::default(), fragment: Some(FragmentState { - shader: MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE, + shader: visibility_buffer_hardware_raster, shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS".into()], entry_point: "fragment".into(), targets: vec![Some(ColorTargetState { @@ -411,7 +479,7 @@ impl FromWorld for MeshletPipelines { }), multisample: MultisampleState::default(), fragment: Some(FragmentState { - shader: MESHLET_RESOLVE_RENDER_TARGETS_SHADER_HANDLE, + shader: resolve_render_targets.clone(), shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], entry_point: "resolve_depth".into(), targets: vec![], @@ -435,7 +503,7 @@ impl FromWorld for MeshletPipelines { }), multisample: MultisampleState::default(), fragment: Some(FragmentState { - shader: MESHLET_RESOLVE_RENDER_TARGETS_SHADER_HANDLE, + shader: resolve_render_targets.clone(), shader_defs: vec![], entry_point: "resolve_depth".into(), targets: vec![], @@ -460,7 +528,7 @@ impl FromWorld for MeshletPipelines { }), multisample: MultisampleState::default(), fragment: Some(FragmentState { - shader: MESHLET_RESOLVE_RENDER_TARGETS_SHADER_HANDLE, + shader: resolve_render_targets, shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], entry_point: "resolve_material_depth".into(), targets: vec![], @@ -469,6 +537,21 @@ impl FromWorld for MeshletPipelines { }, ), + fill_counts: pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { + label: Some("meshlet_fill_counts_pipeline".into()), + layout: vec![fill_counts_layout], + push_constant_ranges: vec![], + shader: fill_counts, + shader_defs: vec![if remap_1d_to_2d_dispatch_layout.is_some() { + "MESHLET_2D_DISPATCH" + } else { + "" + } + .into()], + entry_point: "fill_counts".into(), + zero_initialize_workgroup_memory: false, + }), + remap_1d_to_2d_dispatch: remap_1d_to_2d_dispatch_layout.map(|layout| { pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { label: Some("meshlet_remap_1d_to_2d_dispatch_pipeline".into()), @@ -477,12 +560,14 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..4, }], - shader: MESHLET_REMAP_1D_TO_2D_DISPATCH_SHADER_HANDLE, + shader: remap_1d_to_2d_dispatch, shader_defs: vec![], entry_point: "remap_dispatch".into(), zero_initialize_workgroup_memory: false, }) }), + + meshlet_mesh_material, } } } @@ -502,6 +587,9 @@ impl MeshletPipelines { &ComputePipeline, &ComputePipeline, &ComputePipeline, + &ComputePipeline, + &ComputePipeline, + &ComputePipeline, &RenderPipeline, &RenderPipeline, &RenderPipeline, @@ -509,15 +597,19 @@ impl MeshletPipelines { &RenderPipeline, &RenderPipeline, Option<&ComputePipeline>, + &ComputePipeline, )> { let pipeline_cache = world.get_resource::()?; let pipeline = world.get_resource::()?; Some(( - pipeline_cache.get_compute_pipeline(pipeline.fill_cluster_buffers)?, pipeline_cache.get_compute_pipeline(pipeline.clear_visibility_buffer)?, pipeline_cache.get_compute_pipeline(pipeline.clear_visibility_buffer_shadow_view)?, - pipeline_cache.get_compute_pipeline(pipeline.cull_first)?, - pipeline_cache.get_compute_pipeline(pipeline.cull_second)?, + pipeline_cache.get_compute_pipeline(pipeline.first_instance_cull)?, + pipeline_cache.get_compute_pipeline(pipeline.second_instance_cull)?, + pipeline_cache.get_compute_pipeline(pipeline.first_bvh_cull)?, + pipeline_cache.get_compute_pipeline(pipeline.second_bvh_cull)?, + pipeline_cache.get_compute_pipeline(pipeline.first_meshlet_cull)?, + pipeline_cache.get_compute_pipeline(pipeline.second_meshlet_cull)?, pipeline_cache.get_compute_pipeline(pipeline.downsample_depth_first)?, pipeline_cache.get_compute_pipeline(pipeline.downsample_depth_second)?, pipeline_cache.get_compute_pipeline(pipeline.downsample_depth_first_shadow_view)?, @@ -538,6 +630,7 @@ impl MeshletPipelines { Some(id) => Some(pipeline_cache.get_compute_pipeline(id)?), None => None, }, + pipeline_cache.get_compute_pipeline(pipeline.fill_counts)?, )) } } diff --git a/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl b/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl index fc984436347bd..b9970c42b42eb 100644 --- a/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl +++ b/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl @@ -13,11 +13,12 @@ var max_compute_workgroups_per_dimension: u32; @compute @workgroup_size(1, 1, 1) fn remap_dispatch() { - meshlet_software_raster_cluster_count = meshlet_software_raster_indirect_args.x; + let cluster_count = meshlet_software_raster_indirect_args.x; - if meshlet_software_raster_cluster_count > max_compute_workgroups_per_dimension { - let n = u32(ceil(sqrt(f32(meshlet_software_raster_cluster_count)))); + if cluster_count > max_compute_workgroups_per_dimension { + let n = u32(ceil(sqrt(f32(cluster_count)))); meshlet_software_raster_indirect_args.x = n; meshlet_software_raster_indirect_args.y = n; + meshlet_software_raster_cluster_count = cluster_count; } } diff --git a/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl b/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl index eaa4eed6c4560..6fef0cc22717d 100644 --- a/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl +++ b/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl @@ -1,11 +1,12 @@ #import bevy_core_pipeline::fullscreen_vertex_shader::FullscreenVertexOutput +#import bevy_pbr::meshlet_bindings::InstancedOffset #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT @group(0) @binding(0) var meshlet_visibility_buffer: texture_storage_2d; #else @group(0) @binding(0) var meshlet_visibility_buffer: texture_storage_2d; #endif -@group(0) @binding(1) var meshlet_cluster_instance_ids: array; // Per cluster +@group(0) @binding(1) var meshlet_raster_clusters: array; // Per cluster @group(0) @binding(2) var meshlet_instance_material_ids: array; // Per entity instance /// This pass writes out the depth texture. @@ -33,7 +34,7 @@ fn resolve_material_depth(in: FullscreenVertexOutput) -> @builtin(frag_depth) f3 if depth == 0lu { discard; } let cluster_id = u32(visibility) >> 7u; - let instance_id = meshlet_cluster_instance_ids[cluster_id]; + let instance_id = meshlet_raster_clusters[cluster_id].instance_id; let material_id = meshlet_instance_material_ids[instance_id]; return f32(material_id) / 65535.0; } diff --git a/crates/bevy_pbr/src/meshlet/resource_manager.rs b/crates/bevy_pbr/src/meshlet/resource_manager.rs index 9b45d7676ab43..6c35eef69f4ee 100644 --- a/crates/bevy_pbr/src/meshlet/resource_manager.rs +++ b/crates/bevy_pbr/src/meshlet/resource_manager.rs @@ -1,6 +1,5 @@ use super::{instance_manager::InstanceManager, meshlet_mesh_manager::MeshletMeshManager}; use crate::ShadowView; -use alloc::sync::Arc; use bevy_core_pipeline::{ core_3d::Camera3d, experimental::mip_generation::{self, ViewDepthPyramid}, @@ -21,25 +20,26 @@ use bevy_render::{ view::{ExtractedView, RenderLayers, ViewUniform, ViewUniforms}, }; use binding_types::*; -use core::{iter, sync::atomic::AtomicBool}; -use encase::internal::WriteInto; +use core::iter; /// Manages per-view and per-cluster GPU resources for [`super::MeshletPlugin`]. #[derive(Resource)] pub struct ResourceManager { /// Intermediate buffer of cluster IDs for use with rasterizing the visibility buffer visibility_buffer_raster_clusters: Buffer, + /// Intermediate buffer of previous counts of clusters in rasterizer buckets + pub visibility_buffer_raster_cluster_prev_counts: Buffer, /// Intermediate buffer of count of clusters to software rasterize software_raster_cluster_count: Buffer, - /// Rightmost slot index of [`Self::visibility_buffer_raster_clusters`] - raster_cluster_rightmost_slot: u32, - - /// Per-cluster instance ID - cluster_instance_ids: Option, - /// Per-cluster meshlet ID - cluster_meshlet_ids: Option, - /// Per-cluster bitmask of whether or not it's a candidate for the second raster pass - second_pass_candidates_buffer: Option, + /// BVH traversal queues + bvh_traversal_queues: [Buffer; 2], + /// Cluster cull candidate queue + cluster_cull_candidate_queue: Buffer, + /// Rightmost slot index of [`Self::visibility_buffer_raster_clusters`], [`Self::bvh_traversal_queues`], and [`Self::cluster_cull_candidate_queue`] + cull_queue_rightmost_slot: u32, + + /// Second pass instance candidates + second_pass_candidates: Option, /// Sampler for a depth pyramid depth_pyramid_sampler: Sampler, /// Dummy texture view for binding depth pyramids with less than the maximum amount of mips @@ -49,10 +49,14 @@ pub struct ResourceManager { previous_depth_pyramids: EntityHashMap, // Bind group layouts - pub fill_cluster_buffers_bind_group_layout: BindGroupLayout, pub clear_visibility_buffer_bind_group_layout: BindGroupLayout, pub clear_visibility_buffer_shadow_view_bind_group_layout: BindGroupLayout, - pub culling_bind_group_layout: BindGroupLayout, + pub first_instance_cull_bind_group_layout: BindGroupLayout, + pub second_instance_cull_bind_group_layout: BindGroupLayout, + pub first_bvh_cull_bind_group_layout: BindGroupLayout, + pub second_bvh_cull_bind_group_layout: BindGroupLayout, + pub first_meshlet_cull_bind_group_layout: BindGroupLayout, + pub second_meshlet_cull_bind_group_layout: BindGroupLayout, pub visibility_buffer_raster_bind_group_layout: BindGroupLayout, pub visibility_buffer_raster_shadow_view_bind_group_layout: BindGroupLayout, pub downsample_depth_bind_group_layout: BindGroupLayout, @@ -61,6 +65,7 @@ pub struct ResourceManager { pub resolve_depth_shadow_view_bind_group_layout: BindGroupLayout, pub resolve_material_depth_bind_group_layout: BindGroupLayout, pub material_shade_bind_group_layout: BindGroupLayout, + pub fill_counts_bind_group_layout: BindGroupLayout, pub remap_1d_to_2d_dispatch_bind_group_layout: Option, } @@ -68,25 +73,52 @@ impl ResourceManager { pub fn new(cluster_buffer_slots: u32, render_device: &RenderDevice) -> Self { let needs_dispatch_remap = cluster_buffer_slots > render_device.limits().max_compute_workgroups_per_dimension; + let cull_queue_size = 2 * cluster_buffer_slots as u64 * size_of::() as u64; Self { visibility_buffer_raster_clusters: render_device.create_buffer(&BufferDescriptor { label: Some("meshlet_visibility_buffer_raster_clusters"), - size: cluster_buffer_slots as u64 * size_of::() as u64, + size: cull_queue_size, usage: BufferUsages::STORAGE, mapped_at_creation: false, }), + visibility_buffer_raster_cluster_prev_counts: render_device.create_buffer( + &BufferDescriptor { + label: Some("meshlet_visibility_buffer_raster_cluster_prev_counts"), + size: size_of::() as u64 * 2, + usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, + mapped_at_creation: false, + }, + ), software_raster_cluster_count: render_device.create_buffer(&BufferDescriptor { label: Some("meshlet_software_raster_cluster_count"), size: size_of::() as u64, usage: BufferUsages::STORAGE, mapped_at_creation: false, }), - raster_cluster_rightmost_slot: cluster_buffer_slots - 1, + bvh_traversal_queues: [ + render_device.create_buffer(&BufferDescriptor { + label: Some("meshlet_bvh_traversal_queue_0"), + size: cull_queue_size, + usage: BufferUsages::STORAGE, + mapped_at_creation: false, + }), + render_device.create_buffer(&BufferDescriptor { + label: Some("meshlet_bvh_traversal_queue_1"), + size: cull_queue_size, + usage: BufferUsages::STORAGE, + mapped_at_creation: false, + }), + ], + cluster_cull_candidate_queue: render_device.create_buffer(&BufferDescriptor { + label: Some("meshlet_cluster_cull_candidate_queue"), + size: cull_queue_size, + usage: BufferUsages::STORAGE, + mapped_at_creation: false, + }), + cull_queue_rightmost_slot: cluster_buffer_slots - 1, - cluster_instance_ids: None, - cluster_meshlet_ids: None, - second_pass_candidates_buffer: None, + second_pass_candidates: None, depth_pyramid_sampler: render_device.create_sampler(&SamplerDescriptor { label: Some("meshlet_depth_pyramid_sampler"), ..SamplerDescriptor::default() @@ -100,52 +132,146 @@ impl ResourceManager { previous_depth_pyramids: EntityHashMap::default(), // TODO: Buffer min sizes - fill_cluster_buffers_bind_group_layout: render_device.create_bind_group_layout( - "meshlet_fill_cluster_buffers_bind_group_layout", + clear_visibility_buffer_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_clear_visibility_buffer_bind_group_layout", + &BindGroupLayoutEntries::single( + ShaderStages::COMPUTE, + texture_storage_2d(TextureFormat::R64Uint, StorageTextureAccess::WriteOnly), + ), + ), + clear_visibility_buffer_shadow_view_bind_group_layout: render_device + .create_bind_group_layout( + "meshlet_clear_visibility_buffer_shadow_view_bind_group_layout", + &BindGroupLayoutEntries::single( + ShaderStages::COMPUTE, + texture_storage_2d(TextureFormat::R32Uint, StorageTextureAccess::WriteOnly), + ), + ), + first_instance_cull_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_first_instance_culling_bind_group_layout", &BindGroupLayoutEntries::sequential( ShaderStages::COMPUTE, ( + texture_2d(TextureSampleType::Float { filterable: false }), + uniform_buffer::(true), + uniform_buffer::(true), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_sized(false, None), storage_buffer_sized(false, None), storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), ), ), ), - clear_visibility_buffer_bind_group_layout: render_device.create_bind_group_layout( - "meshlet_clear_visibility_buffer_bind_group_layout", - &BindGroupLayoutEntries::single( + second_instance_cull_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_second_instance_culling_bind_group_layout", + &BindGroupLayoutEntries::sequential( ShaderStages::COMPUTE, - texture_storage_2d(TextureFormat::R64Uint, StorageTextureAccess::WriteOnly), + ( + texture_2d(TextureSampleType::Float { filterable: false }), + uniform_buffer::(true), + uniform_buffer::(true), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + ), ), ), - clear_visibility_buffer_shadow_view_bind_group_layout: render_device - .create_bind_group_layout( - "meshlet_clear_visibility_buffer_shadow_view_bind_group_layout", - &BindGroupLayoutEntries::single( - ShaderStages::COMPUTE, - texture_storage_2d(TextureFormat::R32Uint, StorageTextureAccess::WriteOnly), + first_bvh_cull_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_first_bvh_culling_bind_group_layout", + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + texture_2d(TextureSampleType::Float { filterable: false }), + uniform_buffer::(true), + uniform_buffer::(true), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), ), ), - culling_bind_group_layout: render_device.create_bind_group_layout( - "meshlet_culling_bind_group_layout", + ), + second_bvh_cull_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_second_bvh_culling_bind_group_layout", &BindGroupLayoutEntries::sequential( ShaderStages::COMPUTE, ( + texture_2d(TextureSampleType::Float { filterable: false }), + uniform_buffer::(true), + uniform_buffer::(true), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + ), + ), + ), + first_meshlet_cull_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_first_meshlet_culling_bind_group_layout", + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + texture_2d(TextureSampleType::Float { filterable: false }), + uniform_buffer::(true), + uniform_buffer::(true), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_sized(false, None), + storage_buffer_read_only_sized(false, None), storage_buffer_sized(false, None), storage_buffer_sized(false, None), storage_buffer_sized(false, None), + ), + ), + ), + second_meshlet_cull_bind_group_layout: render_device.create_bind_group_layout( + "meshlet_second_meshlet_culling_bind_group_layout", + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( texture_2d(TextureSampleType::Float { filterable: false }), uniform_buffer::(true), uniform_buffer::(true), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_read_only_sized(false, None), + storage_buffer_read_only_sized(false, None), ), ), ), @@ -215,7 +341,6 @@ impl ResourceManager { storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), - storage_buffer_read_only_sized(false, None), texture_storage_2d(TextureFormat::R64Uint, StorageTextureAccess::Atomic), uniform_buffer::(true), ), @@ -234,7 +359,6 @@ impl ResourceManager { storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), - storage_buffer_read_only_sized(false, None), texture_storage_2d( TextureFormat::R32Uint, StorageTextureAccess::Atomic, @@ -281,10 +405,35 @@ impl ResourceManager { storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None), - storage_buffer_read_only_sized(false, None), ), ), ), + fill_counts_bind_group_layout: if needs_dispatch_remap { + render_device.create_bind_group_layout( + "meshlet_fill_counts_bind_group_layout", + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + ), + ), + ) + } else { + render_device.create_bind_group_layout( + "meshlet_fill_counts_bind_group_layout", + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + storage_buffer_sized(false, None), + ), + ), + ) + }, remap_1d_to_2d_dispatch_bind_group_layout: needs_dispatch_remap.then(|| { render_device.create_bind_group_layout( "meshlet_remap_1d_to_2d_dispatch_bind_group_layout", @@ -306,57 +455,56 @@ impl ResourceManager { #[derive(Component)] pub struct MeshletViewResources { pub scene_instance_count: u32, - pub scene_cluster_count: u32, - pub second_pass_candidates_buffer: Buffer, + pub rightmost_slot: u32, + pub max_bvh_depth: u32, instance_visibility: Buffer, pub dummy_render_target: CachedTexture, pub visibility_buffer: CachedTexture, - pub visibility_buffer_software_raster_indirect_args_first: Buffer, - pub visibility_buffer_software_raster_indirect_args_second: Buffer, - pub visibility_buffer_hardware_raster_indirect_args_first: Buffer, - pub visibility_buffer_hardware_raster_indirect_args_second: Buffer, + pub second_pass_count: Buffer, + pub second_pass_dispatch: Buffer, + pub second_pass_candidates: Buffer, + pub first_bvh_cull_count_front: Buffer, + pub first_bvh_cull_dispatch_front: Buffer, + pub first_bvh_cull_count_back: Buffer, + pub first_bvh_cull_dispatch_back: Buffer, + pub first_bvh_cull_queue: Buffer, + pub second_bvh_cull_count_front: Buffer, + pub second_bvh_cull_dispatch_front: Buffer, + pub second_bvh_cull_count_back: Buffer, + pub second_bvh_cull_dispatch_back: Buffer, + pub second_bvh_cull_queue: Buffer, + pub front_meshlet_cull_count: Buffer, + pub front_meshlet_cull_dispatch: Buffer, + pub back_meshlet_cull_count: Buffer, + pub back_meshlet_cull_dispatch: Buffer, + pub meshlet_cull_queue: Buffer, + pub visibility_buffer_software_raster_indirect_args: Buffer, + pub visibility_buffer_hardware_raster_indirect_args: Buffer, pub depth_pyramid: ViewDepthPyramid, previous_depth_pyramid: TextureView, pub material_depth: Option, pub view_size: UVec2, - pub raster_cluster_rightmost_slot: u32, not_shadow_view: bool, } #[derive(Component)] pub struct MeshletViewBindGroups { - pub first_node: Arc, - pub fill_cluster_buffers: BindGroup, pub clear_visibility_buffer: BindGroup, - pub culling_first: BindGroup, - pub culling_second: BindGroup, + pub first_instance_cull: BindGroup, + pub second_instance_cull: BindGroup, + pub first_bvh_cull_ping: BindGroup, + pub first_bvh_cull_pong: BindGroup, + pub second_bvh_cull_ping: BindGroup, + pub second_bvh_cull_pong: BindGroup, + pub first_meshlet_cull: BindGroup, + pub second_meshlet_cull: BindGroup, pub downsample_depth: BindGroup, pub visibility_buffer_raster: BindGroup, pub resolve_depth: BindGroup, pub resolve_material_depth: Option, pub material_shade: Option, - pub remap_1d_to_2d_dispatch: Option<(BindGroup, BindGroup)>, -} - -// TODO: Try using Queue::write_buffer_with() in queue_meshlet_mesh_upload() to reduce copies -fn upload_storage_buffer( - buffer: &mut StorageBuffer>, - render_device: &RenderDevice, - render_queue: &RenderQueue, -) where - Vec: WriteInto, -{ - let inner = buffer.buffer(); - let capacity = inner.map_or(0, |b| b.size()); - let size = buffer.get().size().get() as BufferAddress; - - if capacity >= size { - let inner = inner.unwrap(); - let bytes = bytemuck::must_cast_slice(buffer.get().as_slice()); - render_queue.write_buffer(inner, 0, bytes); - } else { - buffer.write_buffer(render_device, render_queue); - } + pub remap_1d_to_2d_dispatch: Option, + pub fill_counts: BindGroup, } // TODO: Cache things per-view and skip running this system / optimize this system @@ -374,7 +522,7 @@ pub fn prepare_meshlet_per_frame_resources( render_device: Res, mut commands: Commands, ) { - if instance_manager.scene_cluster_count == 0 { + if instance_manager.scene_instance_count == 0 { return; } @@ -384,41 +532,22 @@ pub fn prepare_meshlet_per_frame_resources( instance_manager .instance_uniforms .write_buffer(&render_device, &render_queue); - upload_storage_buffer( - &mut instance_manager.instance_material_ids, - &render_device, - &render_queue, - ); - upload_storage_buffer( - &mut instance_manager.instance_meshlet_counts, - &render_device, - &render_queue, - ); - upload_storage_buffer( - &mut instance_manager.instance_meshlet_slice_starts, - &render_device, - &render_queue, - ); - - let needed_buffer_size = 4 * instance_manager.scene_cluster_count as u64; - match &mut resource_manager.cluster_instance_ids { - Some(buffer) if buffer.size() >= needed_buffer_size => buffer.clone(), - slot => { - let buffer = render_device.create_buffer(&BufferDescriptor { - label: Some("meshlet_cluster_instance_ids"), - size: needed_buffer_size, - usage: BufferUsages::STORAGE, - mapped_at_creation: false, - }); - *slot = Some(buffer.clone()); - buffer - } - }; - match &mut resource_manager.cluster_meshlet_ids { + instance_manager + .instance_aabbs + .write_buffer(&render_device, &render_queue); + instance_manager + .instance_material_ids + .write_buffer(&render_device, &render_queue); + instance_manager + .instance_bvh_root_nodes + .write_buffer(&render_device, &render_queue); + + let needed_buffer_size = 4 * instance_manager.scene_instance_count as u64; + let second_pass_candidates = match &mut resource_manager.second_pass_candidates { Some(buffer) if buffer.size() >= needed_buffer_size => buffer.clone(), slot => { let buffer = render_device.create_buffer(&BufferDescriptor { - label: Some("meshlet_cluster_meshlet_ids"), + label: Some("meshlet_second_pass_candidates"), size: needed_buffer_size, usage: BufferUsages::STORAGE, mapped_at_creation: false, @@ -428,8 +557,6 @@ pub fn prepare_meshlet_per_frame_resources( } }; - let needed_buffer_size = - instance_manager.scene_cluster_count.div_ceil(u32::BITS) as u64 * size_of::() as u64; for (view_entity, view, render_layers, (_, shadow_view)) in &views { let not_shadow_view = shadow_view.is_none(); @@ -460,24 +587,9 @@ pub fn prepare_meshlet_per_frame_resources( vec[index] |= 1 << bit; } } - upload_storage_buffer(instance_visibility, &render_device, &render_queue); + instance_visibility.write_buffer(&render_device, &render_queue); let instance_visibility = instance_visibility.buffer().unwrap().clone(); - let second_pass_candidates_buffer = - match &mut resource_manager.second_pass_candidates_buffer { - Some(buffer) if buffer.size() >= needed_buffer_size => buffer.clone(), - slot => { - let buffer = render_device.create_buffer(&BufferDescriptor { - label: Some("meshlet_second_pass_candidates"), - size: needed_buffer_size, - usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, - mapped_at_creation: false, - }); - *slot = Some(buffer.clone()); - buffer - } - }; - // TODO: Remove this once wgpu allows render passes with no attachments let dummy_render_target = texture_cache.get( &render_device, @@ -519,34 +631,102 @@ pub fn prepare_meshlet_per_frame_resources( }, ); - let visibility_buffer_software_raster_indirect_args_first = render_device - .create_buffer_with_data(&BufferInitDescriptor { - label: Some("meshlet_visibility_buffer_software_raster_indirect_args_first"), + let second_pass_count = render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_second_pass_count"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE, + }); + let second_pass_dispatch = render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_second_pass_dispatch"), + contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), + usage: BufferUsages::STORAGE | BufferUsages::INDIRECT, + }); + + let first_bvh_cull_count_front = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_first_bvh_cull_count_front"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, + }); + let first_bvh_cull_dispatch_front = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_first_bvh_cull_dispatch_front"), + contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), + usage: BufferUsages::STORAGE | BufferUsages::INDIRECT | BufferUsages::COPY_DST, + }); + let first_bvh_cull_count_back = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_first_bvh_cull_count_back"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, + }); + let first_bvh_cull_dispatch_back = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_first_bvh_cull_dispatch_back"), + contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), + usage: BufferUsages::STORAGE | BufferUsages::INDIRECT | BufferUsages::COPY_DST, + }); + + let second_bvh_cull_count_front = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_second_bvh_cull_count_front"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, + }); + let second_bvh_cull_dispatch_front = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_second_bvh_cull_dispatch_front"), + contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), + usage: BufferUsages::STORAGE | BufferUsages::INDIRECT | BufferUsages::COPY_DST, + }); + let second_bvh_cull_count_back = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_second_bvh_cull_count_back"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, + }); + let second_bvh_cull_dispatch_back = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_second_bvh_cull_dispatch_back"), + contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), + usage: BufferUsages::STORAGE | BufferUsages::INDIRECT | BufferUsages::COPY_DST, + }); + + let front_meshlet_cull_count = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_front_meshlet_cull_count"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE, + }); + let front_meshlet_cull_dispatch = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_front_meshlet_cull_dispatch"), contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), usage: BufferUsages::STORAGE | BufferUsages::INDIRECT, }); - let visibility_buffer_software_raster_indirect_args_second = render_device - .create_buffer_with_data(&BufferInitDescriptor { - label: Some("visibility_buffer_software_raster_indirect_args_second"), + let back_meshlet_cull_count = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_back_meshlet_cull_count"), + contents: bytemuck::bytes_of(&0u32), + usage: BufferUsages::STORAGE, + }); + let back_meshlet_cull_dispatch = + render_device.create_buffer_with_data(&BufferInitDescriptor { + label: Some("meshlet_back_meshlet_cull_dispatch"), contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), usage: BufferUsages::STORAGE | BufferUsages::INDIRECT, }); - let visibility_buffer_hardware_raster_indirect_args_first = render_device + let visibility_buffer_software_raster_indirect_args = render_device .create_buffer_with_data(&BufferInitDescriptor { - label: Some("meshlet_visibility_buffer_hardware_raster_indirect_args_first"), - contents: DrawIndirectArgs { - vertex_count: 128 * 3, - instance_count: 0, - first_vertex: 0, - first_instance: 0, - } - .as_bytes(), + label: Some("meshlet_visibility_buffer_software_raster_indirect_args"), + contents: DispatchIndirectArgs { x: 0, y: 1, z: 1 }.as_bytes(), usage: BufferUsages::STORAGE | BufferUsages::INDIRECT, }); - let visibility_buffer_hardware_raster_indirect_args_second = render_device + + let visibility_buffer_hardware_raster_indirect_args = render_device .create_buffer_with_data(&BufferInitDescriptor { - label: Some("visibility_buffer_hardware_raster_indirect_args_second"), + label: Some("meshlet_visibility_buffer_hardware_raster_indirect_args"), contents: DrawIndirectArgs { vertex_count: 128 * 3, instance_count: 0, @@ -592,21 +772,36 @@ pub fn prepare_meshlet_per_frame_resources( commands.entity(view_entity).insert(MeshletViewResources { scene_instance_count: instance_manager.scene_instance_count, - scene_cluster_count: instance_manager.scene_cluster_count, - second_pass_candidates_buffer, + rightmost_slot: resource_manager.cull_queue_rightmost_slot, + max_bvh_depth: instance_manager.max_bvh_depth, instance_visibility, dummy_render_target, visibility_buffer, - visibility_buffer_software_raster_indirect_args_first, - visibility_buffer_software_raster_indirect_args_second, - visibility_buffer_hardware_raster_indirect_args_first, - visibility_buffer_hardware_raster_indirect_args_second, + second_pass_count, + second_pass_dispatch, + second_pass_candidates: second_pass_candidates.clone(), + first_bvh_cull_count_front, + first_bvh_cull_dispatch_front, + first_bvh_cull_count_back, + first_bvh_cull_dispatch_back, + first_bvh_cull_queue: resource_manager.bvh_traversal_queues[0].clone(), + second_bvh_cull_count_front, + second_bvh_cull_dispatch_front, + second_bvh_cull_count_back, + second_bvh_cull_dispatch_back, + second_bvh_cull_queue: resource_manager.bvh_traversal_queues[1].clone(), + front_meshlet_cull_count, + front_meshlet_cull_dispatch, + back_meshlet_cull_count, + back_meshlet_cull_dispatch, + meshlet_cull_queue: resource_manager.cluster_cull_candidate_queue.clone(), + visibility_buffer_software_raster_indirect_args, + visibility_buffer_hardware_raster_indirect_args, depth_pyramid, previous_depth_pyramid, material_depth: not_shadow_view .then(|| texture_cache.get(&render_device, material_depth)), view_size: view.viewport.zw(), - raster_cluster_rightmost_slot: resource_manager.raster_cluster_rightmost_slot, not_shadow_view, }); } @@ -622,49 +817,15 @@ pub fn prepare_meshlet_view_bind_groups( render_device: Res, mut commands: Commands, ) { - let ( - Some(cluster_instance_ids), - Some(cluster_meshlet_ids), - Some(view_uniforms), - Some(previous_view_uniforms), - ) = ( - resource_manager.cluster_instance_ids.as_ref(), - resource_manager.cluster_meshlet_ids.as_ref(), + let (Some(view_uniforms), Some(previous_view_uniforms)) = ( view_uniforms.uniforms.binding(), previous_view_uniforms.uniforms.binding(), - ) - else { + ) else { return; }; - let first_node = Arc::new(AtomicBool::new(true)); - - let fill_cluster_buffers_global_cluster_count = - render_device.create_buffer(&BufferDescriptor { - label: Some("meshlet_fill_cluster_buffers_global_cluster_count"), - size: 4, - usage: BufferUsages::STORAGE, - mapped_at_creation: false, - }); - // TODO: Some of these bind groups can be reused across multiple views for (view_entity, view_resources) in &views { - let entries = BindGroupEntries::sequential(( - instance_manager.instance_meshlet_counts.binding().unwrap(), - instance_manager - .instance_meshlet_slice_starts - .binding() - .unwrap(), - cluster_instance_ids.as_entire_binding(), - cluster_meshlet_ids.as_entire_binding(), - fill_cluster_buffers_global_cluster_count.as_entire_binding(), - )); - let fill_cluster_buffers = render_device.create_bind_group( - "meshlet_fill_cluster_buffers", - &resource_manager.fill_cluster_buffers_bind_group_layout, - &entries, - ); - let clear_visibility_buffer = render_device.create_bind_group( "meshlet_clear_visibility_buffer_bind_group", if view_resources.not_shadow_view { @@ -675,62 +836,241 @@ pub fn prepare_meshlet_view_bind_groups( &BindGroupEntries::single(&view_resources.visibility_buffer.default_view), ); - let entries = BindGroupEntries::sequential(( - cluster_meshlet_ids.as_entire_binding(), - meshlet_mesh_manager.meshlet_bounding_spheres.binding(), - meshlet_mesh_manager.meshlet_simplification_errors.binding(), - cluster_instance_ids.as_entire_binding(), - instance_manager.instance_uniforms.binding().unwrap(), - view_resources.instance_visibility.as_entire_binding(), - view_resources - .second_pass_candidates_buffer - .as_entire_binding(), - view_resources - .visibility_buffer_software_raster_indirect_args_first - .as_entire_binding(), - view_resources - .visibility_buffer_hardware_raster_indirect_args_first - .as_entire_binding(), - resource_manager - .visibility_buffer_raster_clusters - .as_entire_binding(), - &view_resources.previous_depth_pyramid, - view_uniforms.clone(), - previous_view_uniforms.clone(), - )); - let culling_first = render_device.create_bind_group( - "meshlet_culling_first_bind_group", - &resource_manager.culling_bind_group_layout, - &entries, + let first_instance_cull = render_device.create_bind_group( + "meshlet_first_instance_cull_bind_group", + &resource_manager.first_instance_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources.instance_visibility.as_entire_binding(), + instance_manager.instance_aabbs.binding().unwrap(), + instance_manager.instance_bvh_root_nodes.binding().unwrap(), + view_resources + .first_bvh_cull_count_front + .as_entire_binding(), + view_resources + .first_bvh_cull_dispatch_front + .as_entire_binding(), + view_resources.first_bvh_cull_queue.as_entire_binding(), + view_resources.second_pass_count.as_entire_binding(), + view_resources.second_pass_dispatch.as_entire_binding(), + view_resources.second_pass_candidates.as_entire_binding(), + )), + ); + + let second_instance_cull = render_device.create_bind_group( + "meshlet_second_instance_cull_bind_group", + &resource_manager.second_instance_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources.instance_visibility.as_entire_binding(), + instance_manager.instance_aabbs.binding().unwrap(), + instance_manager.instance_bvh_root_nodes.binding().unwrap(), + view_resources + .second_bvh_cull_count_front + .as_entire_binding(), + view_resources + .second_bvh_cull_dispatch_front + .as_entire_binding(), + view_resources.second_bvh_cull_queue.as_entire_binding(), + view_resources.second_pass_count.as_entire_binding(), + view_resources.second_pass_candidates.as_entire_binding(), + )), ); - let entries = BindGroupEntries::sequential(( - cluster_meshlet_ids.as_entire_binding(), - meshlet_mesh_manager.meshlet_bounding_spheres.binding(), - meshlet_mesh_manager.meshlet_simplification_errors.binding(), - cluster_instance_ids.as_entire_binding(), - instance_manager.instance_uniforms.binding().unwrap(), - view_resources.instance_visibility.as_entire_binding(), - view_resources - .second_pass_candidates_buffer - .as_entire_binding(), - view_resources - .visibility_buffer_software_raster_indirect_args_second - .as_entire_binding(), - view_resources - .visibility_buffer_hardware_raster_indirect_args_second - .as_entire_binding(), - resource_manager - .visibility_buffer_raster_clusters - .as_entire_binding(), - &view_resources.depth_pyramid.all_mips, - view_uniforms.clone(), - previous_view_uniforms.clone(), - )); - let culling_second = render_device.create_bind_group( - "meshlet_culling_second_bind_group", - &resource_manager.culling_bind_group_layout, - &entries, + let first_bvh_cull_ping = render_device.create_bind_group( + "meshlet_first_bvh_cull_ping_bind_group", + &resource_manager.first_bvh_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + meshlet_mesh_manager.bvh_nodes.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources + .first_bvh_cull_count_front + .as_entire_binding(), + view_resources.first_bvh_cull_count_back.as_entire_binding(), + view_resources + .first_bvh_cull_dispatch_back + .as_entire_binding(), + view_resources.first_bvh_cull_queue.as_entire_binding(), + view_resources.front_meshlet_cull_count.as_entire_binding(), + view_resources.back_meshlet_cull_count.as_entire_binding(), + view_resources + .front_meshlet_cull_dispatch + .as_entire_binding(), + view_resources + .back_meshlet_cull_dispatch + .as_entire_binding(), + view_resources.meshlet_cull_queue.as_entire_binding(), + view_resources + .second_bvh_cull_count_front + .as_entire_binding(), + view_resources + .second_bvh_cull_dispatch_front + .as_entire_binding(), + view_resources.second_bvh_cull_queue.as_entire_binding(), + )), + ); + + let first_bvh_cull_pong = render_device.create_bind_group( + "meshlet_first_bvh_cull_pong_bind_group", + &resource_manager.first_bvh_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + meshlet_mesh_manager.bvh_nodes.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources.first_bvh_cull_count_back.as_entire_binding(), + view_resources + .first_bvh_cull_count_front + .as_entire_binding(), + view_resources + .first_bvh_cull_dispatch_front + .as_entire_binding(), + view_resources.first_bvh_cull_queue.as_entire_binding(), + view_resources.front_meshlet_cull_count.as_entire_binding(), + view_resources.back_meshlet_cull_count.as_entire_binding(), + view_resources + .front_meshlet_cull_dispatch + .as_entire_binding(), + view_resources + .back_meshlet_cull_dispatch + .as_entire_binding(), + view_resources.meshlet_cull_queue.as_entire_binding(), + view_resources + .second_bvh_cull_count_front + .as_entire_binding(), + view_resources + .second_bvh_cull_dispatch_front + .as_entire_binding(), + view_resources.second_bvh_cull_queue.as_entire_binding(), + )), + ); + + let second_bvh_cull_ping = render_device.create_bind_group( + "meshlet_second_bvh_cull_ping_bind_group", + &resource_manager.second_bvh_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + meshlet_mesh_manager.bvh_nodes.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources + .second_bvh_cull_count_front + .as_entire_binding(), + view_resources + .second_bvh_cull_count_back + .as_entire_binding(), + view_resources + .second_bvh_cull_dispatch_back + .as_entire_binding(), + view_resources.second_bvh_cull_queue.as_entire_binding(), + view_resources.front_meshlet_cull_count.as_entire_binding(), + view_resources.back_meshlet_cull_count.as_entire_binding(), + view_resources + .front_meshlet_cull_dispatch + .as_entire_binding(), + view_resources + .back_meshlet_cull_dispatch + .as_entire_binding(), + view_resources.meshlet_cull_queue.as_entire_binding(), + )), + ); + + let second_bvh_cull_pong = render_device.create_bind_group( + "meshlet_second_bvh_cull_pong_bind_group", + &resource_manager.second_bvh_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + meshlet_mesh_manager.bvh_nodes.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources + .second_bvh_cull_count_back + .as_entire_binding(), + view_resources + .second_bvh_cull_count_front + .as_entire_binding(), + view_resources + .second_bvh_cull_dispatch_front + .as_entire_binding(), + view_resources.second_bvh_cull_queue.as_entire_binding(), + view_resources.front_meshlet_cull_count.as_entire_binding(), + view_resources.back_meshlet_cull_count.as_entire_binding(), + view_resources + .front_meshlet_cull_dispatch + .as_entire_binding(), + view_resources + .back_meshlet_cull_dispatch + .as_entire_binding(), + view_resources.meshlet_cull_queue.as_entire_binding(), + )), + ); + + let first_meshlet_cull = render_device.create_bind_group( + "meshlet_first_meshlet_cull_bind_group", + &resource_manager.first_meshlet_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + meshlet_mesh_manager.meshlet_cull_data.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources + .visibility_buffer_software_raster_indirect_args + .as_entire_binding(), + view_resources + .visibility_buffer_hardware_raster_indirect_args + .as_entire_binding(), + resource_manager + .visibility_buffer_raster_cluster_prev_counts + .as_entire_binding(), + resource_manager + .visibility_buffer_raster_clusters + .as_entire_binding(), + view_resources.front_meshlet_cull_count.as_entire_binding(), + view_resources.back_meshlet_cull_count.as_entire_binding(), + view_resources + .back_meshlet_cull_dispatch + .as_entire_binding(), + view_resources.meshlet_cull_queue.as_entire_binding(), + )), + ); + + let second_meshlet_cull = render_device.create_bind_group( + "meshlet_second_meshlet_cull_bind_group", + &resource_manager.second_meshlet_cull_bind_group_layout, + &BindGroupEntries::sequential(( + &view_resources.previous_depth_pyramid, + view_uniforms.clone(), + previous_view_uniforms.clone(), + meshlet_mesh_manager.meshlet_cull_data.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + view_resources + .visibility_buffer_software_raster_indirect_args + .as_entire_binding(), + view_resources + .visibility_buffer_hardware_raster_indirect_args + .as_entire_binding(), + resource_manager + .visibility_buffer_raster_cluster_prev_counts + .as_entire_binding(), + resource_manager + .visibility_buffer_raster_clusters + .as_entire_binding(), + view_resources.back_meshlet_cull_count.as_entire_binding(), + view_resources.meshlet_cull_queue.as_entire_binding(), + )), ); let downsample_depth = view_resources.depth_pyramid.create_bind_group( @@ -745,22 +1085,6 @@ pub fn prepare_meshlet_view_bind_groups( &resource_manager.depth_pyramid_sampler, ); - let entries = BindGroupEntries::sequential(( - cluster_meshlet_ids.as_entire_binding(), - meshlet_mesh_manager.meshlets.binding(), - meshlet_mesh_manager.indices.binding(), - meshlet_mesh_manager.vertex_positions.binding(), - cluster_instance_ids.as_entire_binding(), - instance_manager.instance_uniforms.binding().unwrap(), - resource_manager - .visibility_buffer_raster_clusters - .as_entire_binding(), - resource_manager - .software_raster_cluster_count - .as_entire_binding(), - &view_resources.visibility_buffer.default_view, - view_uniforms.clone(), - )); let visibility_buffer_raster = render_device.create_bind_group( "meshlet_visibility_raster_buffer_bind_group", if view_resources.not_shadow_view { @@ -768,7 +1092,23 @@ pub fn prepare_meshlet_view_bind_groups( } else { &resource_manager.visibility_buffer_raster_shadow_view_bind_group_layout }, - &entries, + &BindGroupEntries::sequential(( + resource_manager + .visibility_buffer_raster_clusters + .as_entire_binding(), + meshlet_mesh_manager.meshlets.binding(), + meshlet_mesh_manager.indices.binding(), + meshlet_mesh_manager.vertex_positions.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + resource_manager + .visibility_buffer_raster_cluster_prev_counts + .as_entire_binding(), + resource_manager + .software_raster_cluster_count + .as_entire_binding(), + &view_resources.visibility_buffer.default_view, + view_uniforms.clone(), + )), ); let resolve_depth = render_device.create_bind_group( @@ -782,34 +1122,35 @@ pub fn prepare_meshlet_view_bind_groups( ); let resolve_material_depth = view_resources.material_depth.as_ref().map(|_| { - let entries = BindGroupEntries::sequential(( - &view_resources.visibility_buffer.default_view, - cluster_instance_ids.as_entire_binding(), - instance_manager.instance_material_ids.binding().unwrap(), - )); render_device.create_bind_group( "meshlet_resolve_material_depth_bind_group", &resource_manager.resolve_material_depth_bind_group_layout, - &entries, + &BindGroupEntries::sequential(( + &view_resources.visibility_buffer.default_view, + resource_manager + .visibility_buffer_raster_clusters + .as_entire_binding(), + instance_manager.instance_material_ids.binding().unwrap(), + )), ) }); let material_shade = view_resources.material_depth.as_ref().map(|_| { - let entries = BindGroupEntries::sequential(( - &view_resources.visibility_buffer.default_view, - cluster_meshlet_ids.as_entire_binding(), - meshlet_mesh_manager.meshlets.binding(), - meshlet_mesh_manager.indices.binding(), - meshlet_mesh_manager.vertex_positions.binding(), - meshlet_mesh_manager.vertex_normals.binding(), - meshlet_mesh_manager.vertex_uvs.binding(), - cluster_instance_ids.as_entire_binding(), - instance_manager.instance_uniforms.binding().unwrap(), - )); render_device.create_bind_group( "meshlet_mesh_material_shade_bind_group", &resource_manager.material_shade_bind_group_layout, - &entries, + &BindGroupEntries::sequential(( + &view_resources.visibility_buffer.default_view, + resource_manager + .visibility_buffer_raster_clusters + .as_entire_binding(), + meshlet_mesh_manager.meshlets.binding(), + meshlet_mesh_manager.indices.binding(), + meshlet_mesh_manager.vertex_positions.binding(), + meshlet_mesh_manager.vertex_normals.binding(), + meshlet_mesh_manager.vertex_uvs.binding(), + instance_manager.instance_uniforms.binding().unwrap(), + )), ) }); @@ -817,46 +1158,77 @@ pub fn prepare_meshlet_view_bind_groups( .remap_1d_to_2d_dispatch_bind_group_layout .as_ref() .map(|layout| { - ( - render_device.create_bind_group( - "meshlet_remap_1d_to_2d_dispatch_first_bind_group", - layout, - &BindGroupEntries::sequential(( - view_resources - .visibility_buffer_software_raster_indirect_args_first - .as_entire_binding(), - resource_manager - .software_raster_cluster_count - .as_entire_binding(), - )), - ), - render_device.create_bind_group( - "meshlet_remap_1d_to_2d_dispatch_second_bind_group", - layout, - &BindGroupEntries::sequential(( - view_resources - .visibility_buffer_software_raster_indirect_args_second - .as_entire_binding(), - resource_manager - .software_raster_cluster_count - .as_entire_binding(), - )), - ), + render_device.create_bind_group( + "meshlet_remap_1d_to_2d_dispatch_bind_group", + layout, + &BindGroupEntries::sequential(( + view_resources + .visibility_buffer_software_raster_indirect_args + .as_entire_binding(), + resource_manager + .software_raster_cluster_count + .as_entire_binding(), + )), ) }); + let fill_counts = if resource_manager + .remap_1d_to_2d_dispatch_bind_group_layout + .is_some() + { + render_device.create_bind_group( + "meshlet_fill_counts_bind_group", + &resource_manager.fill_counts_bind_group_layout, + &BindGroupEntries::sequential(( + view_resources + .visibility_buffer_software_raster_indirect_args + .as_entire_binding(), + view_resources + .visibility_buffer_hardware_raster_indirect_args + .as_entire_binding(), + resource_manager + .visibility_buffer_raster_cluster_prev_counts + .as_entire_binding(), + resource_manager + .software_raster_cluster_count + .as_entire_binding(), + )), + ) + } else { + render_device.create_bind_group( + "meshlet_fill_counts_bind_group", + &resource_manager.fill_counts_bind_group_layout, + &BindGroupEntries::sequential(( + view_resources + .visibility_buffer_software_raster_indirect_args + .as_entire_binding(), + view_resources + .visibility_buffer_hardware_raster_indirect_args + .as_entire_binding(), + resource_manager + .visibility_buffer_raster_cluster_prev_counts + .as_entire_binding(), + )), + ) + }; + commands.entity(view_entity).insert(MeshletViewBindGroups { - first_node: Arc::clone(&first_node), - fill_cluster_buffers, clear_visibility_buffer, - culling_first, - culling_second, + first_instance_cull, + second_instance_cull, + first_bvh_cull_ping, + first_bvh_cull_pong, + second_bvh_cull_ping, + second_bvh_cull_pong, + first_meshlet_cull, + second_meshlet_cull, downsample_depth, visibility_buffer_raster, resolve_depth, resolve_material_depth, material_shade, remap_1d_to_2d_dispatch, + fill_counts, }); } } diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl index 3525d38e6da95..2a251443fbf50 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl @@ -5,6 +5,7 @@ meshlet_cluster_instance_ids, meshlet_instance_uniforms, meshlet_raster_clusters, + meshlet_previous_raster_counts, meshlet_visibility_buffer, view, get_meshlet_triangle_count, @@ -27,17 +28,17 @@ struct VertexOutput { @vertex fn vertex(@builtin(instance_index) instance_index: u32, @builtin(vertex_index) vertex_index: u32) -> VertexOutput { - let cluster_id = meshlet_raster_clusters[meshlet_raster_cluster_rightmost_slot - instance_index]; - let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; - var meshlet = meshlets[meshlet_id]; + let cluster_in_draw = meshlet_previous_raster_counts[1] + instance_index; + let cluster_id = meshlet_raster_cluster_rightmost_slot - cluster_in_draw; + let instanced_offset = meshlet_raster_clusters[cluster_id]; + var meshlet = meshlets[instanced_offset.offset]; let triangle_id = vertex_index / 3u; if triangle_id >= get_meshlet_triangle_count(&meshlet) { return dummy_vertex(); } - let index_id = (triangle_id * 3u) + (vertex_index % 3u); + let index_id = vertex_index; let vertex_id = get_meshlet_vertex_id(meshlet.start_index_id + index_id); - let instance_id = meshlet_cluster_instance_ids[cluster_id]; - let instance_uniform = meshlet_instance_uniforms[instance_id]; + let instance_uniform = meshlet_instance_uniforms[instanced_offset.instance_id]; let vertex_position = get_meshlet_vertex_position(&meshlet, vertex_id); let world_from_local = affine3_to_square(instance_uniform.world_from_local); diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs index 20054d2d2f53a..160097fc50070 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs @@ -2,14 +2,16 @@ use super::{ pipelines::MeshletPipelines, resource_manager::{MeshletViewBindGroups, MeshletViewResources}, }; -use crate::{LightEntity, ShadowView, ViewLightEntities}; +use crate::{ + meshlet::resource_manager::ResourceManager, LightEntity, ShadowView, ViewLightEntities, +}; use bevy_color::LinearRgba; use bevy_core_pipeline::prepass::PreviousViewUniformOffset; use bevy_ecs::{ query::QueryState, world::{FromWorld, World}, }; -use bevy_math::{ops, UVec2}; +use bevy_math::UVec2; use bevy_render::{ camera::ExtractedCamera, render_graph::{Node, NodeRunError, RenderGraphContext}, @@ -17,7 +19,6 @@ use bevy_render::{ renderer::RenderContext, view::{ViewDepthTexture, ViewUniformOffset}, }; -use core::sync::atomic::Ordering; /// Rasterize meshlets into a depth buffer, and optional visibility buffer + material depth buffer for shading passes. pub struct MeshletVisibilityBufferRasterPassNode { @@ -76,11 +77,14 @@ impl Node for MeshletVisibilityBufferRasterPassNode { }; let Some(( - fill_cluster_buffers_pipeline, clear_visibility_buffer_pipeline, clear_visibility_buffer_shadow_view_pipeline, - culling_first_pipeline, - culling_second_pipeline, + first_instance_cull_pipeline, + second_instance_cull_pipeline, + first_bvh_cull_pipeline, + second_bvh_cull_pipeline, + first_meshlet_cull_pipeline, + second_meshlet_cull_pipeline, downsample_depth_first_pipeline, downsample_depth_second_pipeline, downsample_depth_first_shadow_view_pipeline, @@ -94,69 +98,60 @@ impl Node for MeshletVisibilityBufferRasterPassNode { resolve_depth_shadow_view_pipeline, resolve_material_depth_pipeline, remap_1d_to_2d_dispatch_pipeline, + fill_counts_pipeline, )) = MeshletPipelines::get(world) else { return Ok(()); }; - let first_node = meshlet_view_bind_groups - .first_node - .fetch_and(false, Ordering::SeqCst); - - let div_ceil = meshlet_view_resources.scene_cluster_count.div_ceil(128); - let thread_per_cluster_workgroups = ops::cbrt(div_ceil as f32).ceil() as u32; - render_context .command_encoder() .push_debug_group("meshlet_visibility_buffer_raster"); - if first_node { - fill_cluster_buffers_pass( - render_context, - &meshlet_view_bind_groups.fill_cluster_buffers, - fill_cluster_buffers_pipeline, - meshlet_view_resources.scene_instance_count, - ); - } + + let resource_manager = world.get_resource::().unwrap(); + render_context.command_encoder().clear_buffer( + &resource_manager.visibility_buffer_raster_cluster_prev_counts, + 0, + None, + ); + clear_visibility_buffer_pass( render_context, &meshlet_view_bind_groups.clear_visibility_buffer, clear_visibility_buffer_pipeline, meshlet_view_resources.view_size, ); - render_context.command_encoder().clear_buffer( - &meshlet_view_resources.second_pass_candidates_buffer, - 0, - None, - ); - cull_pass( - "culling_first", + + render_context + .command_encoder() + .push_debug_group("meshlet_first_pass"); + first_cull( render_context, - &meshlet_view_bind_groups.culling_first, + meshlet_view_bind_groups, + meshlet_view_resources, view_offset, previous_view_offset, - culling_first_pipeline, - thread_per_cluster_workgroups, - meshlet_view_resources.scene_cluster_count, - meshlet_view_resources.raster_cluster_rightmost_slot, - meshlet_view_bind_groups - .remap_1d_to_2d_dispatch - .as_ref() - .map(|(bg1, _)| bg1), + first_instance_cull_pipeline, + first_bvh_cull_pipeline, + first_meshlet_cull_pipeline, remap_1d_to_2d_dispatch_pipeline, ); raster_pass( true, render_context, - &meshlet_view_resources.visibility_buffer_software_raster_indirect_args_first, - &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args_first, + &meshlet_view_resources.visibility_buffer_software_raster_indirect_args, + &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args, &meshlet_view_resources.dummy_render_target.default_view, meshlet_view_bind_groups, view_offset, visibility_buffer_software_raster_pipeline, visibility_buffer_hardware_raster_pipeline, + fill_counts_pipeline, Some(camera), - meshlet_view_resources.raster_cluster_rightmost_slot, + meshlet_view_resources.rightmost_slot, ); + render_context.command_encoder().pop_debug_group(); + meshlet_view_resources.depth_pyramid.downsample_depth( "downsample_depth", render_context, @@ -165,35 +160,37 @@ impl Node for MeshletVisibilityBufferRasterPassNode { downsample_depth_first_pipeline, downsample_depth_second_pipeline, ); - cull_pass( - "culling_second", + + render_context + .command_encoder() + .push_debug_group("meshlet_second_pass"); + second_cull( render_context, - &meshlet_view_bind_groups.culling_second, + meshlet_view_bind_groups, + meshlet_view_resources, view_offset, previous_view_offset, - culling_second_pipeline, - thread_per_cluster_workgroups, - meshlet_view_resources.scene_cluster_count, - meshlet_view_resources.raster_cluster_rightmost_slot, - meshlet_view_bind_groups - .remap_1d_to_2d_dispatch - .as_ref() - .map(|(_, bg2)| bg2), + second_instance_cull_pipeline, + second_bvh_cull_pipeline, + second_meshlet_cull_pipeline, remap_1d_to_2d_dispatch_pipeline, ); raster_pass( false, render_context, - &meshlet_view_resources.visibility_buffer_software_raster_indirect_args_second, - &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args_second, + &meshlet_view_resources.visibility_buffer_software_raster_indirect_args, + &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args, &meshlet_view_resources.dummy_render_target.default_view, meshlet_view_bind_groups, view_offset, visibility_buffer_software_raster_pipeline, visibility_buffer_hardware_raster_pipeline, + fill_counts_pipeline, Some(camera), - meshlet_view_resources.raster_cluster_rightmost_slot, + meshlet_view_resources.rightmost_slot, ); + render_context.command_encoder().pop_debug_group(); + resolve_depth( render_context, view_depth.get_attachment(StoreOp::Store), @@ -248,40 +245,37 @@ impl Node for MeshletVisibilityBufferRasterPassNode { clear_visibility_buffer_shadow_view_pipeline, meshlet_view_resources.view_size, ); - render_context.command_encoder().clear_buffer( - &meshlet_view_resources.second_pass_candidates_buffer, - 0, - None, - ); - cull_pass( - "culling_first", + + render_context + .command_encoder() + .push_debug_group("meshlet_first_pass"); + first_cull( render_context, - &meshlet_view_bind_groups.culling_first, + meshlet_view_bind_groups, + meshlet_view_resources, view_offset, previous_view_offset, - culling_first_pipeline, - thread_per_cluster_workgroups, - meshlet_view_resources.scene_cluster_count, - meshlet_view_resources.raster_cluster_rightmost_slot, - meshlet_view_bind_groups - .remap_1d_to_2d_dispatch - .as_ref() - .map(|(bg1, _)| bg1), + first_instance_cull_pipeline, + first_bvh_cull_pipeline, + first_meshlet_cull_pipeline, remap_1d_to_2d_dispatch_pipeline, ); raster_pass( true, render_context, - &meshlet_view_resources.visibility_buffer_software_raster_indirect_args_first, - &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args_first, + &meshlet_view_resources.visibility_buffer_software_raster_indirect_args, + &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args, &meshlet_view_resources.dummy_render_target.default_view, meshlet_view_bind_groups, view_offset, visibility_buffer_software_raster_shadow_view_pipeline, shadow_visibility_buffer_hardware_raster_pipeline, + fill_counts_pipeline, None, - meshlet_view_resources.raster_cluster_rightmost_slot, + meshlet_view_resources.rightmost_slot, ); + render_context.command_encoder().pop_debug_group(); + meshlet_view_resources.depth_pyramid.downsample_depth( "downsample_depth", render_context, @@ -290,35 +284,37 @@ impl Node for MeshletVisibilityBufferRasterPassNode { downsample_depth_first_shadow_view_pipeline, downsample_depth_second_shadow_view_pipeline, ); - cull_pass( - "culling_second", + + render_context + .command_encoder() + .push_debug_group("meshlet_second_pass"); + second_cull( render_context, - &meshlet_view_bind_groups.culling_second, + meshlet_view_bind_groups, + meshlet_view_resources, view_offset, previous_view_offset, - culling_second_pipeline, - thread_per_cluster_workgroups, - meshlet_view_resources.scene_cluster_count, - meshlet_view_resources.raster_cluster_rightmost_slot, - meshlet_view_bind_groups - .remap_1d_to_2d_dispatch - .as_ref() - .map(|(_, bg2)| bg2), + second_instance_cull_pipeline, + second_bvh_cull_pipeline, + second_meshlet_cull_pipeline, remap_1d_to_2d_dispatch_pipeline, ); raster_pass( false, render_context, - &meshlet_view_resources.visibility_buffer_software_raster_indirect_args_second, - &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args_second, + &meshlet_view_resources.visibility_buffer_software_raster_indirect_args, + &meshlet_view_resources.visibility_buffer_hardware_raster_indirect_args, &meshlet_view_resources.dummy_render_target.default_view, meshlet_view_bind_groups, view_offset, visibility_buffer_software_raster_shadow_view_pipeline, shadow_visibility_buffer_hardware_raster_pipeline, + fill_counts_pipeline, None, - meshlet_view_resources.raster_cluster_rightmost_slot, + meshlet_view_resources.rightmost_slot, ); + render_context.command_encoder().pop_debug_group(); + resolve_depth( render_context, shadow_view.depth_attachment.get_attachment(StoreOp::Store), @@ -341,39 +337,6 @@ impl Node for MeshletVisibilityBufferRasterPassNode { } } -fn fill_cluster_buffers_pass( - render_context: &mut RenderContext, - fill_cluster_buffers_bind_group: &BindGroup, - fill_cluster_buffers_pass_pipeline: &ComputePipeline, - scene_instance_count: u32, -) { - let mut fill_cluster_buffers_pass_workgroups_x = scene_instance_count; - let mut fill_cluster_buffers_pass_workgroups_y = 1; - if scene_instance_count - > render_context - .render_device() - .limits() - .max_compute_workgroups_per_dimension - { - fill_cluster_buffers_pass_workgroups_x = (scene_instance_count as f32).sqrt().ceil() as u32; - fill_cluster_buffers_pass_workgroups_y = fill_cluster_buffers_pass_workgroups_x; - } - - let command_encoder = render_context.command_encoder(); - let mut fill_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { - label: Some("fill_cluster_buffers"), - timestamp_writes: None, - }); - fill_pass.set_pipeline(fill_cluster_buffers_pass_pipeline); - fill_pass.set_push_constants(0, &scene_instance_count.to_le_bytes()); - fill_pass.set_bind_group(0, fill_cluster_buffers_bind_group, &[]); - fill_pass.dispatch_workgroups( - fill_cluster_buffers_pass_workgroups_x, - fill_cluster_buffers_pass_workgroups_y, - 1, - ); -} - // TODO: Replace this with vkCmdClearColorImage once wgpu supports it fn clear_visibility_buffer_pass( render_context: &mut RenderContext, @@ -397,82 +360,231 @@ fn clear_visibility_buffer_pass( ); } -fn cull_pass( - label: &'static str, +fn first_cull( render_context: &mut RenderContext, - culling_bind_group: &BindGroup, + meshlet_view_bind_groups: &MeshletViewBindGroups, + meshlet_view_resources: &MeshletViewResources, view_offset: &ViewUniformOffset, previous_view_offset: &PreviousViewUniformOffset, - culling_pipeline: &ComputePipeline, - culling_workgroups: u32, - scene_cluster_count: u32, - raster_cluster_rightmost_slot: u32, - remap_1d_to_2d_dispatch_bind_group: Option<&BindGroup>, - remap_1d_to_2d_dispatch_pipeline: Option<&ComputePipeline>, + first_instance_cull_pipeline: &ComputePipeline, + first_bvh_cull_pipeline: &ComputePipeline, + first_meshlet_cull_pipeline: &ComputePipeline, + remap_1d_to_2d_pipeline: Option<&ComputePipeline>, ) { - let max_compute_workgroups_per_dimension = render_context - .render_device() - .limits() - .max_compute_workgroups_per_dimension; + let workgroups = meshlet_view_resources.scene_instance_count.div_ceil(128); + cull_pass( + "meshlet_first_instance_cull", + render_context, + &meshlet_view_bind_groups.first_instance_cull, + view_offset, + previous_view_offset, + first_instance_cull_pipeline, + &[meshlet_view_resources.scene_instance_count], + ) + .dispatch_workgroups(workgroups, 1, 1); + render_context + .command_encoder() + .push_debug_group("meshlet_first_bvh_cull"); + let mut ping = true; + for _ in 0..meshlet_view_resources.max_bvh_depth { + cull_pass( + "meshlet_first_bvh_cull_dispatch", + render_context, + if ping { + &meshlet_view_bind_groups.first_bvh_cull_ping + } else { + &meshlet_view_bind_groups.first_bvh_cull_pong + }, + view_offset, + previous_view_offset, + first_bvh_cull_pipeline, + &[ping as u32, meshlet_view_resources.rightmost_slot], + ) + .dispatch_workgroups_indirect( + if ping { + &meshlet_view_resources.first_bvh_cull_dispatch_front + } else { + &meshlet_view_resources.first_bvh_cull_dispatch_back + }, + 0, + ); + render_context.command_encoder().clear_buffer( + if ping { + &meshlet_view_resources.first_bvh_cull_count_front + } else { + &meshlet_view_resources.first_bvh_cull_count_back + }, + 0, + Some(4), + ); + render_context.command_encoder().clear_buffer( + if ping { + &meshlet_view_resources.first_bvh_cull_dispatch_front + } else { + &meshlet_view_resources.first_bvh_cull_dispatch_back + }, + 0, + Some(4), + ); + ping = !ping; + } + render_context.command_encoder().pop_debug_group(); + + let mut pass = cull_pass( + "meshlet_first_meshlet_cull", + render_context, + &meshlet_view_bind_groups.first_meshlet_cull, + view_offset, + previous_view_offset, + first_meshlet_cull_pipeline, + &[meshlet_view_resources.rightmost_slot], + ); + pass.dispatch_workgroups_indirect(&meshlet_view_resources.front_meshlet_cull_dispatch, 0); + remap_1d_to_2d( + pass, + remap_1d_to_2d_pipeline, + meshlet_view_bind_groups.remap_1d_to_2d_dispatch.as_ref(), + ); +} + +fn second_cull( + render_context: &mut RenderContext, + meshlet_view_bind_groups: &MeshletViewBindGroups, + meshlet_view_resources: &MeshletViewResources, + view_offset: &ViewUniformOffset, + previous_view_offset: &PreviousViewUniformOffset, + second_instance_cull_pipeline: &ComputePipeline, + second_bvh_cull_pipeline: &ComputePipeline, + second_meshlet_cull_pipeline: &ComputePipeline, + remap_1d_to_2d_pipeline: Option<&ComputePipeline>, +) { + cull_pass( + "meshlet_second_instance_cull", + render_context, + &meshlet_view_bind_groups.second_instance_cull, + view_offset, + previous_view_offset, + second_instance_cull_pipeline, + &[meshlet_view_resources.scene_instance_count], + ) + .dispatch_workgroups_indirect(&meshlet_view_resources.second_pass_dispatch, 0); + + render_context + .command_encoder() + .push_debug_group("meshlet_second_bvh_cull"); + let mut ping = true; + for _ in 0..meshlet_view_resources.max_bvh_depth { + cull_pass( + "meshlet_second_bvh_cull_dispatch", + render_context, + if ping { + &meshlet_view_bind_groups.second_bvh_cull_ping + } else { + &meshlet_view_bind_groups.second_bvh_cull_pong + }, + view_offset, + previous_view_offset, + second_bvh_cull_pipeline, + &[ping as u32, meshlet_view_resources.rightmost_slot], + ) + .dispatch_workgroups_indirect( + if ping { + &meshlet_view_resources.second_bvh_cull_dispatch_front + } else { + &meshlet_view_resources.second_bvh_cull_dispatch_back + }, + 0, + ); + ping = !ping; + } + render_context.command_encoder().pop_debug_group(); + + let mut pass = cull_pass( + "meshlet_second_meshlet_cull", + render_context, + &meshlet_view_bind_groups.second_meshlet_cull, + view_offset, + previous_view_offset, + second_meshlet_cull_pipeline, + &[meshlet_view_resources.rightmost_slot], + ); + pass.dispatch_workgroups_indirect(&meshlet_view_resources.back_meshlet_cull_dispatch, 0); + remap_1d_to_2d( + pass, + remap_1d_to_2d_pipeline, + meshlet_view_bind_groups.remap_1d_to_2d_dispatch.as_ref(), + ); +} + +fn cull_pass<'a>( + label: &'static str, + render_context: &'a mut RenderContext, + bind_group: &'a BindGroup, + view_offset: &'a ViewUniformOffset, + previous_view_offset: &'a PreviousViewUniformOffset, + pipeline: &'a ComputePipeline, + push_constants: &[u32], +) -> ComputePass<'a> { let command_encoder = render_context.command_encoder(); - let mut cull_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { + let mut pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { label: Some(label), timestamp_writes: None, }); - cull_pass.set_pipeline(culling_pipeline); - cull_pass.set_push_constants( + pass.set_pipeline(pipeline); + pass.set_bind_group( 0, - bytemuck::cast_slice(&[scene_cluster_count, raster_cluster_rightmost_slot]), - ); - cull_pass.set_bind_group( - 0, - culling_bind_group, + bind_group, &[view_offset.offset, previous_view_offset.offset], ); - cull_pass.dispatch_workgroups(culling_workgroups, culling_workgroups, culling_workgroups); + pass.set_push_constants(0, bytemuck::cast_slice(push_constants)); + pass +} - if let (Some(remap_1d_to_2d_dispatch_pipeline), Some(remap_1d_to_2d_dispatch_bind_group)) = ( - remap_1d_to_2d_dispatch_pipeline, - remap_1d_to_2d_dispatch_bind_group, - ) { - cull_pass.set_pipeline(remap_1d_to_2d_dispatch_pipeline); - cull_pass.set_push_constants(0, &max_compute_workgroups_per_dimension.to_be_bytes()); - cull_pass.set_bind_group(0, remap_1d_to_2d_dispatch_bind_group, &[]); - cull_pass.dispatch_workgroups(1, 1, 1); +fn remap_1d_to_2d( + mut pass: ComputePass, + pipeline: Option<&ComputePipeline>, + bind_group: Option<&BindGroup>, +) { + if let (Some(pipeline), Some(bind_group)) = (pipeline, bind_group) { + pass.set_pipeline(pipeline); + pass.set_bind_group(0, bind_group, &[]); + pass.dispatch_workgroups(1, 1, 1); } } fn raster_pass( first_pass: bool, render_context: &mut RenderContext, - visibility_buffer_hardware_software_indirect_args: &Buffer, + visibility_buffer_software_raster_indirect_args: &Buffer, visibility_buffer_hardware_raster_indirect_args: &Buffer, dummy_render_target: &TextureView, meshlet_view_bind_groups: &MeshletViewBindGroups, view_offset: &ViewUniformOffset, - visibility_buffer_hardware_software_pipeline: &ComputePipeline, + visibility_buffer_software_raster_pipeline: &ComputePipeline, visibility_buffer_hardware_raster_pipeline: &RenderPipeline, + fill_counts_pipeline: &ComputePipeline, camera: Option<&ExtractedCamera>, raster_cluster_rightmost_slot: u32, ) { - let command_encoder = render_context.command_encoder(); - let mut software_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { - label: Some(if first_pass { - "raster_software_first" - } else { - "raster_software_second" - }), - timestamp_writes: None, - }); - software_pass.set_pipeline(visibility_buffer_hardware_software_pipeline); + let mut software_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some(if first_pass { + "raster_software_first" + } else { + "raster_software_second" + }), + timestamp_writes: None, + }); + software_pass.set_pipeline(visibility_buffer_software_raster_pipeline); software_pass.set_bind_group( 0, &meshlet_view_bind_groups.visibility_buffer_raster, &[view_offset.offset], ); - software_pass - .dispatch_workgroups_indirect(visibility_buffer_hardware_software_indirect_args, 0); + software_pass.dispatch_workgroups_indirect(visibility_buffer_software_raster_indirect_args, 0); drop(software_pass); let mut hardware_pass = render_context.begin_tracked_render_pass(RenderPassDescriptor { @@ -508,6 +620,18 @@ fn raster_pass( &[view_offset.offset], ); hardware_pass.draw_indirect(visibility_buffer_hardware_raster_indirect_args, 0); + drop(hardware_pass); + + let mut fill_counts_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some("fill_counts"), + timestamp_writes: None, + }); + fill_counts_pass.set_pipeline(fill_counts_pipeline); + fill_counts_pass.set_bind_group(0, &meshlet_view_bind_groups.fill_counts, &[]); + fill_counts_pass.dispatch_workgroups(1, 1, 1); } fn resolve_depth( diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl index 4c56c5874ae2f..8d8a22b943ea6 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl @@ -4,9 +4,8 @@ meshlet_bindings::{ Meshlet, meshlet_visibility_buffer, - meshlet_cluster_meshlet_ids, + meshlet_raster_clusters, meshlets, - meshlet_cluster_instance_ids, meshlet_instance_uniforms, get_meshlet_vertex_id, get_meshlet_vertex_position, @@ -106,7 +105,8 @@ struct VertexOutput { fn resolve_vertex_output(frag_coord: vec4) -> VertexOutput { let packed_ids = u32(textureLoad(meshlet_visibility_buffer, vec2(frag_coord.xy)).r); let cluster_id = packed_ids >> 7u; - let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; + let instanced_offset = meshlet_raster_clusters[cluster_id]; + let meshlet_id = instanced_offset.offset; var meshlet = meshlets[meshlet_id]; let triangle_id = extractBits(packed_ids, 0u, 7u); @@ -116,7 +116,7 @@ fn resolve_vertex_output(frag_coord: vec4) -> VertexOutput { let vertex_1 = load_vertex(&meshlet, vertex_ids[1]); let vertex_2 = load_vertex(&meshlet, vertex_ids[2]); - let instance_id = meshlet_cluster_instance_ids[cluster_id]; + let instance_id = instanced_offset.instance_id; var instance_uniform = meshlet_instance_uniforms[instance_id]; let world_from_local = affine3_to_square(instance_uniform.world_from_local); diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl index 60f6f1b3ea658..0ddfff896438d 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl @@ -5,6 +5,7 @@ meshlet_cluster_instance_ids, meshlet_instance_uniforms, meshlet_raster_clusters, + meshlet_previous_raster_counts, meshlet_software_raster_cluster_count, meshlet_visibility_buffer, view, @@ -40,12 +41,11 @@ fn rasterize_cluster( if workgroup_id_1d >= meshlet_software_raster_cluster_count { return; } #endif - let cluster_id = meshlet_raster_clusters[workgroup_id_1d]; - let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; - var meshlet = meshlets[meshlet_id]; + let cluster_id = workgroup_id_1d + meshlet_previous_raster_counts[0]; + let instanced_offset = meshlet_raster_clusters[cluster_id]; + var meshlet = meshlets[instanced_offset.offset]; - let instance_id = meshlet_cluster_instance_ids[cluster_id]; - let instance_uniform = meshlet_instance_uniforms[instance_id]; + let instance_uniform = meshlet_instance_uniforms[instanced_offset.instance_id]; let world_from_local = affine3_to_square(instance_uniform.world_from_local); // Load and project 1 vertex per thread, and then again if there are more than 128 vertices in the meshlet