summary | refs | log | tree | commit | diff
path: root/src/gallium
diff options
context:
space:
mode:
author Karol Herbst <kherbst@redhat.com> 2022-11-18 15:51:18 +0100
committer Marge Bot <emma+marge@anholt.net> 2023-03-31 20:29:00 +0000
commit ac993ae8287ea374a073c7a99fac84dfd773c475 (patch)
tree 643cf6454812bf462828f17d10b1c0b44b63483a /src/gallium
parent c7dd3677dc6632f17d3b69f006f63492b6f9e0be (diff)
rusticl/kernel: make use of cso info
Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19855>
Diffstat (limited to 'src/gallium')
-rw-r--r-- src/gallium/frontends/rusticl/api/kernel.rs 5
-rw-r--r-- src/gallium/frontends/rusticl/core/device.rs 5
-rw-r--r-- src/gallium/frontends/rusticl/core/kernel.rs 107
-rw-r--r-- src/gallium/frontends/rusticl/mesa/pipe/context.rs 9
4 files changed, 76 insertions, 50 deletions
diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs
index e9d5ec205b1..2cb7ef64b6d 100644
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
- cl_prop::<usize>(dev.subgroups() as usize)
+ cl_prop::<usize>(kernel.preferred_simd_size(&dev))
}
CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
- // TODO
- CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
+ CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
// CL_INVALID_VALUE if param_name is not one of the supported values
_ => return Err(CL_INVALID_VALUE),
})
diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs
index 77c51b0bcde..f2450c89ef9 100644
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
fn delete_compute_state(&self, cso: *mut c_void);
+ fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
fn unmap(&self, tx: PipeTransfer);
}
@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
self.lock.delete_compute_state(cso)
}
+ fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+ self.lock.compute_state_info(state)
+ }
+
fn unmap(&self, tx: PipeTransfer) {
tx.with_ctx(&self.lock);
}
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index 2810f240888..e1ce6f0d4bc 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -258,6 +258,7 @@ struct KernelDevStateInner {
nir: NirShader,
constant_buffer: Option<Arc<PipeResource>>,
cso: *mut c_void,
+ info: pipe_compute_state_object_info,
}
struct KernelDevState {
@@ -279,21 +280,25 @@ impl KernelDevState {
let states = nirs
.into_iter()
.map(|(dev, nir)| {
- let cso = if dev.shareable_shaders() {
- dev.helper_ctx()
- .create_compute_state(&nir, nir.shared_size())
- } else {
- ptr::null_mut()
- };
-
+ let mut cso = dev
+ .helper_ctx()
+ .create_compute_state(&nir, nir.shared_size());
+ let info = dev.helper_ctx().compute_state_info(cso);
let cb = Self::create_nir_constant_buffer(&dev, &nir);
+ // if we can't share the cso between threads, destroy it now.
+ if !dev.shareable_shaders() {
+ dev.helper_ctx().delete_compute_state(cso);
+ cso = ptr::null_mut();
+ };
+
(
dev,
KernelDevStateInner {
nir: nir,
constant_buffer: cb,
cso: cso,
+ info: info,
},
)
})
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
val.try_into().unwrap()
}
-fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
- let mut threads = d.max_threads_per_block() as u32;
- let dim_threads = d.max_block_sizes();
- let subgroups = d.subgroups();
-
- if !block.contains(&0) {
- for i in 0..3 {
- // we already made sure everything is fine
- grid[i] /= block[i];
- }
- return;
- }
-
- for i in 0..3 {
- let t = cmp::min(threads, dim_threads[i] as u32);
- let gcd = gcd(t, grid[i]);
-
- block[i] = gcd;
- grid[i] /= gcd;
-
- // update limits
- threads /= block[i];
- }
-
- // if we didn't fill the subgroup we can do a bit better if we have threads remaining
- let total_threads = block[0] * block[1] * block[2];
- if threads != 1 && total_threads < subgroups {
- for i in 0..3 {
- if grid[i] * total_threads < threads {
- block[i] *= grid[i];
- grid[i] = 1;
- // can only do it once as nothing is cleanly divisible
- break;
- }
- }
- }
-}
-
impl Kernel {
pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
let (mut nirs, args, internal_args, attributes_string) =
@@ -895,6 +862,44 @@ impl Kernel {
})
}
+ fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+ let mut threads = self.max_threads_per_block(d) as u32;
+ let dim_threads = d.max_block_sizes();
+ let subgroups = self.preferred_simd_size(d) as u32;
+
+ if !block.contains(&0) {
+ for i in 0..3 {
+ // we already made sure everything is fine
+ grid[i] /= block[i];
+ }
+ return;
+ }
+
+ for i in 0..3 {
+ let t = cmp::min(threads, dim_threads[i] as u32);
+ let gcd = gcd(t, grid[i]);
+
+ block[i] = gcd;
+ grid[i] /= gcd;
+
+ // update limits
+ threads /= block[i];
+ }
+
+ // if we didn't fill the subgroup we can do a bit better if we have threads remaining
+ let total_threads = block[0] * block[1] * block[2];
+ if threads != 1 && total_threads < subgroups {
+ for i in 0..3 {
+ if grid[i] * total_threads < threads {
+ block[i] *= grid[i];
+ grid[i] = 1;
+ // can only do it once as nothing is cleanly divisible
+ break;
+ }
+ }
+ }
+ }
+
// the painful part is, that host threads are allowed to modify the kernel object once it was
// enqueued, so return a closure with all req data included.
pub fn launch(
@@ -928,7 +933,7 @@ impl Kernel {
&[0; 4]
};
- optimize_local_size(&q.device, &mut grid, &mut block);
+ self.optimize_local_size(&q.device, &mut grid, &mut block);
for (arg, val) in self.args.iter().zip(&self.values) {
if arg.dead {
@@ -1225,7 +1230,15 @@ impl Kernel {
}
pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
- self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+ self.dev_state.get(dev).info.private_memory.into()
+ }
+
+ pub fn max_threads_per_block(&self, dev: &Device) -> usize {
+ self.dev_state.get(dev).info.max_threads as usize
+ }
+
+ pub fn preferred_simd_size(&self, dev: &Device) -> usize {
+ self.dev_state.get(dev).info.preferred_simd_size as usize
}
pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
diff --git a/src/gallium/frontends/rusticl/mesa/pipe/context.rs b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
index a4d83fc69d2..abd234bb0b9 100644
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@@ -319,6 +319,14 @@ impl PipeContext {
unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
}
+ pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+ let mut info = pipe_compute_state_object_info::default();
+ unsafe {
+ self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
+ }
+ info
+ }
+
pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
}
@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
& has_required_feature!(context, delete_compute_state)
& has_required_feature!(context, delete_sampler_state)
& has_required_feature!(context, flush)
+ & has_required_feature!(context, get_compute_state_info)
& has_required_feature!(context, launch_grid)
& has_required_feature!(context, memory_barrier)
& has_required_feature!(context, resource_copy_region)