summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarek Olšák <marek.olsak@amd.com>2025-03-26 10:19:40 -0400
committerMarge Bot <emma+marge@anholt.net>2025-03-27 01:59:19 +0000
commit219b2cde13e52f1248b475cb1d7cee453f1a0c65 (patch)
tree0a031f9302bf86fa065519735d490341241bcd20
parent77fb09c8cdaf2b5fb48d7f76a03798fa4286ad1b (diff)
radeonsi: remove clover support
Only Rusticl is supported. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34215>
-rw-r--r--src/amd/common/amd_kernel_code_t.h553
-rw-r--r--src/gallium/drivers/radeonsi/si_compute.c365
-rw-r--r--src/gallium/drivers/radeonsi/si_get.c7
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.c5
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h3
5 files changed, 35 insertions, 898 deletions
diff --git a/src/amd/common/amd_kernel_code_t.h b/src/amd/common/amd_kernel_code_t.h
deleted file mode 100644
index 05767d5153b..00000000000
--- a/src/amd/common/amd_kernel_code_t.h
+++ /dev/null
@@ -1,553 +0,0 @@
-/*
- * Copyright 2015,2016 Advanced Micro Devices, Inc.
- *
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef AMDKERNELCODET_H
-#define AMDKERNELCODET_H
-
-//---------------------------------------------------------------------------//
-// AMD Kernel Code, and its dependencies //
-//---------------------------------------------------------------------------//
-
-// Sets val bits for specified mask in specified dst packed instance.
-#define AMD_HSA_BITS_SET(dst, mask, val) \
- dst &= (~(1 << mask##_SHIFT) & ~mask); \
- dst |= (((val) << mask##_SHIFT) & mask)
-
-// Gets bits for specified mask from specified src packed instance.
-#define AMD_HSA_BITS_GET(src, mask) ((src & mask) >> mask##_SHIFT)
-
-/* Every amd_*_code_t has the following properties, which are composed of
- * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
- * bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount
- * (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.
- *
- * (Note that bit fields cannot be used as their layout is
- * implementation defined in the C standard and so cannot be used to
- * specify an ABI)
- */
-enum amd_code_property_mask_t
-{
-
- /* Enable the setup of the SGPR user data registers
- * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
- * for initial register state.
- *
- * The total number of SGPRuser data registers requested must not
- * exceed 16. Any requests beyond 16 will be ignored.
- *
- * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
- * SGPR user data registers enabled up to 16).
- */
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
- AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
- AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z =
- ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
-
- AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = 10,
- AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 =
- ((1 << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT,
-
- AMD_CODE_PROPERTY_RESERVED1_SHIFT = 11,
- AMD_CODE_PROPERTY_RESERVED1_WIDTH = 5,
- AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1)
- << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
-
- /* Control wave ID base counter for GDS ordered-append. Used to set
- * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
- * ORDERED_APPEND_MODE also needs to be settable)
- */
- AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
- AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
- AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS =
- ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1)
- << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
-
- /* The interleave (swizzle) element size in bytes required by the
- * code for private memory. This must be 2, 4, 8 or 16. This value
- * is provided to the finalizer when it is invoked and is recorded
- * here. The hardware will interleave the memory requests of each
- * lane of a wavefront by this element size to ensure each
- * work-item gets a distinct memory memory location. Therefore, the
- * finalizer ensures that all load and store operations done to
- * private memory do not exceed this size. For example, if the
- * element size is 4 (32-bits or dword) and a 64-bit value must be
- * loaded, the finalizer will generate two 32-bit loads. This
- * ensures that the interleaving will get the work-item
- * specific dword for both halves of the 64-bit value. If it just
- * did a 64-bit load then it would get one dword which belonged to
- * its own work-item, but the second dword would belong to the
- * adjacent lane work-item since the interleaving is in dwords.
- *
- * The value used must match the value that the runtime configures
- * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
- * is generally DWORD.
- *
- * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
- */
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE =
- ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1)
- << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
-
- /* Are global memory addresses 64 bits. Must match
- * amd_kernel_code_t.hsail_machine_model ==
- * HSA_MACHINE_LARGE. Must also match
- * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
- * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
- */
- AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
- AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
- AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1)
- << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
-
- /* Indicate if the generated ISA is using a dynamically sized call
- * stack. This can happen if calls are implemented using a call
- * stack and recursion, alloca or calls to indirect functions are
- * present. In these cases the Finalizer cannot compute the total
- * private segment size at compile time. In this case the
- * workitem_private_segment_byte_size only specifies the statically
- * know private segment size, and additional space must be added
- * for the call stack.
- */
- AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
- AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
- AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK =
- ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1)
- << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
-
- /* Indicate if code generated has support for debugging. */
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1)
- << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
-
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
- AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1)
- << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
-
- AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
- AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
- AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1)
- << AMD_CODE_PROPERTY_RESERVED2_SHIFT
-};
-
-/* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
- * Code Object to set up the hardware to execute the kernel dispatch.
- *
- * Initial Kernel Register State.
- *
- * Initial kernel register state will be set up by CP/SPI prior to the start
- * of execution of every wavefront. This is limited by the constraints of the
- * current hardware.
- *
- * The order of the SGPR registers is defined, but the Finalizer can specify
- * which ones are actually setup in the amd_kernel_code_t object using the
- * enable_sgpr_* bit fields. The register numbers used for enabled registers
- * are dense starting at SGPR0: the first enabled register is SGPR0, the next
- * enabled register is SGPR1 etc.; disabled registers do not have an SGPR
- * number.
- *
- * The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and
- * apply to all waves of the grid. It is possible to specify more than 16 User
- * SGPRs using the enable_sgpr_* bit fields, in which case only the first 16
- * are actually initialized. These are then immediately followed by the System
- * SGPRs that are set up by ADC/SPI and can have different values for each wave
- * of the grid dispatch.
- *
- * SGPR register initial state is defined as follows:
- *
- * Private Segment Buffer (enable_sgpr_private_segment_buffer):
- * Number of User SGPR registers: 4. V# that can be used, together with
- * Scratch Wave Offset as an offset, to access the Private/Spill/Arg
- * segments using a segment address. It must be set as follows:
- * - Base address: of the scratch memory area used by the dispatch. It
- * does not include the scratch wave offset. It will be the per process
- * SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for
- * example there may be a per pipe offset, or per AQL Queue offset).
- * - Stride + data_format: Element Size * Index Stride (???)
- * - Cache swizzle: ???
- * - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for
- * scratch)
- * - Num records: Flat Scratch Work Item Size / Element Size (???)
- * - Dst_sel_*: ???
- * - Num_format: ???
- * - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must
- * agree with amd_kernel_code_t.privateElementSize)
- * - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must
- * be number of wavefront lanes for scratch, must agree with
- * amd_kernel_code_t.wavefrontSize)
- * - Add tid enable: 1
- * - ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
- * - Hash_enable: ???
- * - Heap: ???
- * - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
- * - Type: 0 (a buffer) (???)
- *
- * Dispatch Ptr (enable_sgpr_dispatch_ptr):
- * Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet
- * for kernel actually executing.
- *
- * Queue Ptr (enable_sgpr_queue_ptr):
- * Number of User SGPR registers: 2. 64 bit address of AmdQueue object for
- * AQL queue on which the dispatch packet was queued.
- *
- * Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr):
- * Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This
- * is directly copied from the kernargPtr in the dispatch packet. Having CP
- * load it once avoids loading it at the beginning of every wavefront.
- *
- * Dispatch Id (enable_sgpr_dispatch_id):
- * Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch
- * packet being executed.
- *
- * Flat Scratch Init (enable_sgpr_flat_scratch_init):
- * Number of User SGPR registers: 2. This is 2 SGPRs.
- *
- * For CI/VI:
- * The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE
- * to base of memory for scratch for this dispatch. This is the same offset
- * used in computing the Scratch Segment Buffer base address. The value of
- * Scratch Wave Offset must be added by the kernel code and moved to
- * SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.
- *
- * The second SGPR is 32 bit byte size of a single work-item's scratch
- * memory usage. This is directly loaded from the dispatch packet Private
- * Segment Byte Size and rounded up to a multiple of DWORD.
- *
- * \todo [Does CP need to round this to >4 byte alignment?]
- *
- * The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
- * flat memory instructions. Having CP load it once avoids loading it at
- * the beginning of every wavefront.
- *
- * Private Segment Size (enable_sgpr_private_segment_size):
- * Number of User SGPR registers: 1. The 32 bit byte size of a single
- * work-item's scratch memory allocation. This is the value from the dispatch
- * packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.
- *
- * \todo [Does CP need to round this to >4 byte alignment?]
- *
- * Having CP load it once avoids loading it at the beginning of every
- * wavefront.
- *
- * \todo [This will not be used for CI/VI since it is the same value as
- * the second SGPR of Flat Scratch Init.
- *
- * Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x):
- * Number of User SGPR registers: 1. 32 bit count of the number of
- * work-groups in the X dimension for the grid being executed. Computed from
- * the fields in the HsaDispatchPacket as
- * ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).
- *
- * Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y):
- * Number of User SGPR registers: 1. 32 bit count of the number of
- * work-groups in the Y dimension for the grid being executed. Computed from
- * the fields in the HsaDispatchPacket as
- * ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).
- *
- * Only initialized if <16 previous SGPRs initialized.
- *
- * Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z):
- * Number of User SGPR registers: 1. 32 bit count of the number of
- * work-groups in the Z dimension for the grid being executed. Computed
- * from the fields in the HsaDispatchPacket as
- * ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).
- *
- * Only initialized if <16 previous SGPRs initialized.
- *
- * Work-Group Id X (enable_sgpr_workgroup_id_x):
- * Number of System SGPR registers: 1. 32 bit work group id in X dimension
- * of grid for wavefront. Always present.
- *
- * Work-Group Id Y (enable_sgpr_workgroup_id_y):
- * Number of System SGPR registers: 1. 32 bit work group id in Y dimension
- * of grid for wavefront.
- *
- * Work-Group Id Z (enable_sgpr_workgroup_id_z):
- * Number of System SGPR registers: 1. 32 bit work group id in Z dimension
- * of grid for wavefront. If present then Work-group Id Y will also be
- * present
- *
- * Work-Group Info (enable_sgpr_workgroup_info):
- * Number of System SGPR registers: 1. {first_wave, 14'b0000,
- * ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}
- *
- * Private Segment Wave Byte Offset
- * (enable_sgpr_private_segment_wave_byte_offset):
- * Number of System SGPR registers: 1. 32 bit byte offset from base of
- * dispatch scratch base. Must be used as an offset with Private/Spill/Arg
- * segment address when using Scratch Segment Buffer. It must be added to
- * Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.
- *
- *
- * The order of the VGPR registers is defined, but the Finalizer can specify
- * which ones are actually setup in the amd_kernel_code_t object using the
- * enableVgpr* bit fields. The register numbers used for enabled registers
- * are dense starting at VGPR0: the first enabled register is VGPR0, the next
- * enabled register is VGPR1 etc.; disabled registers do not have an VGPR
- * number.
- *
- * VGPR register initial state is defined as follows:
- *
- * Work-Item Id X (always initialized):
- * Number of registers: 1. 32 bit work item id in X dimension of work-group
- * for wavefront lane.
- *
- * Work-Item Id X (enable_vgpr_workitem_id > 0):
- * Number of registers: 1. 32 bit work item id in Y dimension of work-group
- * for wavefront lane.
- *
- * Work-Item Id X (enable_vgpr_workitem_id > 0):
- * Number of registers: 1. 32 bit work item id in Z dimension of work-group
- * for wavefront lane.
- *
- *
- * The setting of registers is being done by existing GPU hardware as follows:
- * 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data
- * registers.
- * 2) Work-group Id registers X, Y, Z are set by SPI which supports any
- * combination including none.
- * 3) Scratch Wave Offset is also set by SPI which is why its value cannot
- * be added into the value Flat Scratch Offset which would avoid the
- * Finalizer generated prolog having to do the add.
- * 4) The VGPRs are set by SPI which only supports specifying either (X),
- * (X, Y) or (X, Y, Z).
- *
- * Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so
- * they can be moved as a 64 bit value to the hardware required SGPRn-3 and
- * SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.
- *
- * The global segment can be accessed either using flat operations or buffer
- * operations. If buffer operations are used then the Global Buffer used to
- * access HSAIL Global/Readonly/Kernarg (which are combine) segments using a
- * segment address is not passed into the kernel code by CP since its base
- * address is always 0. Instead the Finalizer generates prolog code to
- * initialize 4 SGPRs with a V# that has the following properties, and then
- * uses that in the buffer instructions:
- * - base address of 0
- * - no swizzle
- * - ATC=1
- * - MTYPE set to support memory coherence specified in
- * amd_kernel_code_t.globalMemoryCoherence
- *
- * When the Global Buffer is used to access the Kernarg segment, must add the
- * dispatch packet kernArgPtr to a kernarg segment address before using this V#.
- * Alternatively scalar loads can be used if the kernarg offset is uniform, as
- * the kernarg segment is constant for the duration of the kernel execution.
- */
-
-typedef struct amd_kernel_code_s {
- uint32_t amd_kernel_code_version_major;
- uint32_t amd_kernel_code_version_minor;
- uint16_t amd_machine_kind;
- uint16_t amd_machine_version_major;
- uint16_t amd_machine_version_minor;
- uint16_t amd_machine_version_stepping;
-
- /* Byte offset (possibly negative) from start of amd_kernel_code_t
- * object to kernel's entry point instruction. The actual code for
- * the kernel is required to be 256 byte aligned to match hardware
- * requirements (SQ cache line is 16). The code must be position
- * independent code (PIC) for AMD devices to give runtime the
- * option of copying code to discrete GPU memory or APU L2
- * cache. The Finalizer should endeavour to allocate all kernel
- * machine code in contiguous memory pages so that a device
- * pre-fetcher will tend to only pre-fetch Kernel Code objects,
- * improving cache performance.
- */
- int64_t kernel_code_entry_byte_offset;
-
- /* Range of bytes to consider prefetching expressed as an offset
- * and size. The offset is from the start (possibly negative) of
- * amd_kernel_code_t object. Set both to 0 if no prefetch
- * information is available.
- */
- int64_t kernel_code_prefetch_byte_offset;
- uint64_t kernel_code_prefetch_byte_size;
-
- /* Number of bytes of scratch backing memory required for full
- * occupancy of target chip. This takes into account the number of
- * bytes of scratch per work-item, the wavefront size, the maximum
- * number of wavefronts per CU, and the number of CUs. This is an
- * upper limit on scratch. If the grid being dispatched is small it
- * may only need less than this. If the kernel uses no scratch, or
- * the Finalizer has not computed this value, it must be 0.
- */
- uint64_t max_scratch_backing_memory_byte_size;
-
- /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
- * COMPUTE_PGM_RSRC2 registers.
- */
- uint64_t compute_pgm_resource_registers;
-
- /* Code properties. See amd_code_property_mask_t for a full list of
- * properties.
- */
- uint32_t code_properties;
-
- /* The amount of memory required for the combined private, spill
- * and arg segments for a work-item in bytes. If
- * is_dynamic_callstack is 1 then additional space must be added to
- * this value for the call stack.
- */
- uint32_t workitem_private_segment_byte_size;
-
- /* The amount of group segment memory required by a work-group in
- * bytes. This does not include any dynamically allocated group
- * segment memory that may be added when the kernel is
- * dispatched.
- */
- uint32_t workgroup_group_segment_byte_size;
-
- /* Number of byte of GDS required by kernel dispatch. Must be 0 if
- * not using GDS.
- */
- uint32_t gds_segment_byte_size;
-
- /* The size in bytes of the kernarg segment that holds the values
- * of the arguments to the kernel. This could be used by CP to
- * prefetch the kernarg segment pointed to by the dispatch packet.
- */
- uint64_t kernarg_segment_byte_size;
-
- /* Number of fbarrier's used in the kernel and all functions it
- * calls. If the implementation uses group memory to allocate the
- * fbarriers then that amount must already be included in the
- * workgroup_group_segment_byte_size total.
- */
- uint32_t workgroup_fbarrier_count;
-
- /* Number of scalar registers used by a wavefront. This includes
- * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
- * and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
- * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
- */
- uint16_t wavefront_sgpr_count;
-
- /* Number of vector registers used by each work-item. Used to set
- * COMPUTE_PGM_RSRC1.VGPRS.
- */
- uint16_t workitem_vgpr_count;
-
- /* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
- * first fixed VGPR number reserved.
- */
- uint16_t reserved_vgpr_first;
-
- /* The number of consecutive VGPRs reserved by the client. If
- * is_debug_supported then this count includes VGPRs reserved
- * for debugger use.
- */
- uint16_t reserved_vgpr_count;
-
- /* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
- * first fixed SGPR number reserved.
- */
- uint16_t reserved_sgpr_first;
-
- /* The number of consecutive SGPRs reserved by the client. If
- * is_debug_supported then this count includes SGPRs reserved
- * for debugger use.
- */
- uint16_t reserved_sgpr_count;
-
- /* If is_debug_supported is 0 then must be 0. Otherwise, this is the
- * fixed SGPR number used to hold the wave scratch offset for the
- * entire kernel execution, or uint16_t(-1) if the register is not
- * used or not known.
- */
- uint16_t debug_wavefront_private_segment_offset_sgpr;
-
- /* If is_debug_supported is 0 then must be 0. Otherwise, this is the
- * fixed SGPR number of the first of 4 SGPRs used to hold the
- * scratch V# used for the entire kernel execution, or uint16_t(-1)
- * if the registers are not used or not known.
- */
- uint16_t debug_private_segment_buffer_sgpr;
-
- /* The maximum byte alignment of variables used by the kernel in
- * the specified memory segment. Expressed as a power of two. Must
- * be at least HSA_POWERTWO_16.
- */
- uint8_t kernarg_segment_alignment;
- uint8_t group_segment_alignment;
- uint8_t private_segment_alignment;
-
- /* Wavefront size expressed as a power of two. Must be a power of 2
- * in range 1..64 inclusive. Used to support runtime query that
- * obtains wavefront size, which may be used by application to
- * allocated dynamic group memory and set the dispatch work-group
- * size.
- */
- uint8_t wavefront_size;
-
- int32_t call_convention;
- uint8_t reserved3[12];
- uint64_t runtime_loader_kernel_symbol;
- uint64_t control_directives[16];
-} amd_kernel_code_t;
-
-#endif // AMDKERNELCODET_H
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 4a395b4c859..892dd89be56 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -5,7 +5,6 @@
*/
#include "ac_rtld.h"
-#include "amd_kernel_code_t.h"
#include "nir/tgsi_to_nir.h"
#include "si_build_pm4.h"
#include "si_shader_internal.h"
@@ -21,71 +20,6 @@
fprintf(stderr, fmt, ##args); \
} while (0);
-struct dispatch_packet {
- uint16_t header;
- uint16_t setup;
- uint16_t workgroup_size_x;
- uint16_t workgroup_size_y;
- uint16_t workgroup_size_z;
- uint16_t reserved0;
- uint32_t grid_size_x;
- uint32_t grid_size_y;
- uint32_t grid_size_z;
- uint32_t group_segment_size;
- uint64_t kernel_object;
- uint64_t kernarg_address;
- uint64_t reserved2;
-};
-
-static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program,
- uint64_t symbol_offset)
-{
- const struct si_shader_selector *sel = &program->sel;
-
- if (program->ir_type != PIPE_SHADER_IR_NATIVE)
- return NULL;
-
- struct ac_rtld_binary rtld;
- if (!ac_rtld_open(&rtld,
- (struct ac_rtld_open_info){.info = &sel->screen->info,
- .shader_type = MESA_SHADER_COMPUTE,
- .num_parts = 1,
- .elf_ptrs = &program->shader.binary.code_buffer,
- .elf_sizes = &program->shader.binary.code_size}))
- return NULL;
-
- const amd_kernel_code_t *result = NULL;
- const char *text;
- size_t size;
- if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size))
- goto out;
-
- if (symbol_offset + sizeof(amd_kernel_code_t) > size)
- goto out;
-
- result = (const amd_kernel_code_t *)(text + symbol_offset);
-
-out:
- ac_rtld_close(&rtld);
- return result;
-}
-
-static void code_object_to_config(const amd_kernel_code_t *code_object,
- struct ac_shader_config *out_config)
-{
-
- uint32_t rsrc1 = code_object->compute_pgm_resource_registers;
- uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32;
- out_config->num_sgprs = code_object->wavefront_sgpr_count;
- out_config->num_vgprs = code_object->workitem_vgpr_count;
- out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1);
- out_config->rsrc1 = rsrc1;
- out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2));
- out_config->rsrc2 = rsrc2;
- out_config->scratch_bytes_per_wave =
- align(code_object->workitem_private_segment_byte_size * 64, 1024);
-}
-
/* Asynchronous compute shader compilation. */
static void si_create_compute_state_async(void *job, void *gdata, int thread_index)
{
@@ -101,7 +35,6 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind
assert(thread_index < ARRAY_SIZE(sscreen->compiler));
compiler = &sscreen->compiler[thread_index];
- assert(program->ir_type == PIPE_SHADER_IR_NIR);
si_nir_scan_shader(sscreen, sel->nir, &sel->info, false);
if (!sel->nir->info.use_aco_amd && !*compiler)
@@ -223,6 +156,10 @@ static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+
+ if (cso->ir_type == PIPE_SHADER_IR_NATIVE)
+ return NULL;
+
struct si_compute *program = CALLOC_STRUCT(si_compute);
struct si_shader_selector *sel = &program->sel;
@@ -236,61 +173,25 @@ static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe
si_sampler_and_image_descriptors_idx(PIPE_SHADER_COMPUTE);
sel->info.base.shared_size = cso->static_shared_mem;
program->shader.selector = &program->sel;
- program->ir_type = cso->ir_type;
- program->input_size = cso->req_input_mem;
-
- if (cso->ir_type != PIPE_SHADER_IR_NATIVE) {
- if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
- program->ir_type = PIPE_SHADER_IR_NIR;
- sel->nir = tgsi_to_nir(cso->prog, ctx->screen, true);
- } else {
- assert(cso->ir_type == PIPE_SHADER_IR_NIR);
- sel->nir = (struct nir_shader *)cso->prog;
- }
- sel->nir->info.shared_size = cso->static_shared_mem;
-
- if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_INIT_NIR))
- nir_print_shader(sel->nir, stderr);
-
- sel->compiler_ctx_state.debug = sctx->debug;
- sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
- p_atomic_inc(&sscreen->num_shaders_created);
-
- si_schedule_initial_compile(sctx, MESA_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state,
- program, si_create_compute_state_async);
+ if (cso->ir_type == PIPE_SHADER_IR_TGSI) {
+ sel->nir = tgsi_to_nir(cso->prog, ctx->screen, true);
} else {
- const struct pipe_binary_program_header *header;
- header = cso->prog;
-
- program->shader.binary.type = SI_SHADER_BINARY_ELF;
- program->shader.binary.code_size = header->num_bytes;
- program->shader.binary.code_buffer = malloc(header->num_bytes);
- if (!program->shader.binary.code_buffer) {
- FREE(program);
- return NULL;
- }
- memcpy((void *)program->shader.binary.code_buffer, header->blob, header->num_bytes);
-
- const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0);
- code_object_to_config(code_object, &program->shader.config);
+ assert(cso->ir_type == PIPE_SHADER_IR_NIR);
+ sel->nir = (struct nir_shader *)cso->prog;
+ }
- if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32))
- program->shader.wave_size = 32;
- else
- program->shader.wave_size = 64;
+ sel->nir->info.shared_size = cso->static_shared_mem;
- bool ok = si_shader_binary_upload(sctx->screen, &program->shader, 0);
- si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true);
+ if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_INIT_NIR))
+ nir_print_shader(sel->nir, stderr);
- if (!ok) {
- fprintf(stderr, "LLVM failed to upload shader\n");
- free((void *)program->shader.binary.code_buffer);
- FREE(program);
- return NULL;
- }
- }
+ sel->compiler_ctx_state.debug = sctx->debug;
+ sel->compiler_ctx_state.is_debug_context = sctx->is_debug;
+ p_atomic_inc(&sscreen->num_shaders_created);
+ si_schedule_initial_compile(sctx, MESA_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state,
+ program, si_create_compute_state_async);
return program;
}
@@ -300,8 +201,6 @@ static void si_get_compute_state_info(struct pipe_context *ctx, void *state,
struct si_compute *program = (struct si_compute *)state;
struct si_shader_selector *sel = &program->sel;
- assert(program->ir_type != PIPE_SHADER_IR_NATIVE);
-
/* Wait because we need the compilation to finish first */
util_queue_fence_wait(&sel->ready);
@@ -323,8 +222,7 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state)
return;
/* Wait because we need active slot usage masks. */
- if (program->ir_type != PIPE_SHADER_IR_NATIVE)
- util_queue_fence_wait(&sel->ready);
+ util_queue_fence_wait(&sel->ready);
si_set_active_descriptors(sctx,
SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
@@ -434,51 +332,29 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_s
}
static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program,
- struct si_shader *shader, const amd_kernel_code_t *code_object,
- unsigned offset, bool *prefetch, unsigned variable_shared_size)
+ struct si_shader *shader, unsigned offset, bool *prefetch,
+ unsigned variable_shared_size)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- struct ac_shader_config inline_config = {0};
- const struct ac_shader_config *config;
+ const struct ac_shader_config *config = &shader->config;
unsigned rsrc2;
- uint64_t shader_va;
unsigned stage = shader->selector->info.base.stage;
*prefetch = false;
- assert(variable_shared_size == 0 || stage == MESA_SHADER_KERNEL || program->ir_type == PIPE_SHADER_IR_NATIVE);
+ assert(variable_shared_size == 0 || stage == MESA_SHADER_KERNEL);
if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset &&
sctx->cs_shader_state.variable_shared_size == variable_shared_size)
return true;
- if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
- config = &shader->config;
- } else {
- code_object_to_config(code_object, &inline_config);
- config = &inline_config;
- }
/* copy rsrc2 so we don't have to change it inside the si_shader object */
rsrc2 = config->rsrc2;
/* only do this for OpenCL */
- if (program->ir_type == PIPE_SHADER_IR_NATIVE || stage == MESA_SHADER_KERNEL) {
+ if (stage == MESA_SHADER_KERNEL) {
unsigned shared_size = program->sel.info.base.shared_size + variable_shared_size;
- unsigned lds_blocks;
-
- /* Clover uses the compute API differently than other frontends and expects drivers to parse
- * the shared_size out of the shader headers.
- */
- if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
- lds_blocks = config->lds_size;
- } else {
- lds_blocks = 0;
- }
+ unsigned lds_blocks = 0;
- /* XXX: We are over allocating LDS. For GFX6, the shader reports
- * LDS in blocks of 256 bytes, so if there are 4 bytes lds
- * allocated in the shader and 4 bytes allocated by the state
- * tracker, then we will set LDS_SIZE to 512 bytes rather than 256.
- */
if (sctx->gfx_level <= GFX6) {
lds_blocks += align(shared_size, 256) >> 8;
} else {
@@ -516,12 +392,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
RADEON_USAGE_READWRITE | RADEON_PRIO_SCRATCH_BUFFER);
}
- shader_va = shader->bo->gpu_address + offset;
- if (program->ir_type == PIPE_SHADER_IR_NATIVE) {
- /* Shader code is placed after the amd_kernel_code_t
- * struct. */
- shader_va += sizeof(amd_kernel_code_t);
- }
+ uint64_t shader_va = shader->bo->gpu_address + offset;
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo,
RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
@@ -604,165 +475,6 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
return true;
}
-static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
- const amd_kernel_code_t *code_object, unsigned user_sgpr)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
-
- unsigned max_private_element_size =
- AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE);
-
- uint32_t scratch_dword0 = scratch_va & 0xffffffff;
- uint32_t scratch_dword1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
-
- if (sctx->gfx_level >= GFX11)
- scratch_dword1 |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
- else
- scratch_dword1 |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
-
- /* Disable address clamping */
- uint32_t scratch_dword2 = 0xffffffff;
- uint32_t index_stride = sctx->cs_shader_state.program->shader.wave_size == 32 ? 2 : 3;
- uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(1);
-
- if (sctx->gfx_level >= GFX9) {
- assert(max_private_element_size == 1); /* only 4 bytes on GFX9 */
- } else {
- scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size);
-
- if (sctx->gfx_level < GFX8) {
- /* BUF_DATA_FORMAT is ignored, but it cannot be
- * BUF_DATA_FORMAT_INVALID. */
- scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8);
- }
- }
-
- radeon_begin(cs);
- radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
- radeon_emit(scratch_dword0);
- radeon_emit(scratch_dword1);
- radeon_emit(scratch_dword2);
- radeon_emit(scratch_dword3);
- radeon_end();
-}
-
-static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object,
- const struct pipe_grid_info *info, uint64_t kernel_args_va)
-{
- struct si_compute *program = sctx->cs_shader_state.program;
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- static const enum amd_code_property_mask_t workgroup_count_masks[] = {
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y,
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z};
-
- unsigned i, user_sgpr = 0;
- if (AMD_HSA_BITS_GET(code_object->code_properties,
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) {
- if (code_object->workitem_private_segment_byte_size > 0) {
- setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr);
- }
- user_sgpr += 4;
- }
-
- radeon_begin(cs);
-
- if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
- struct dispatch_packet dispatch;
- unsigned dispatch_offset;
- struct si_resource *dispatch_buf = NULL;
- uint64_t dispatch_va;
-
- /* Upload dispatch ptr */
- memset(&dispatch, 0, sizeof(dispatch));
-
- dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]);
- dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]);
- dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]);
-
- dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]);
- dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]);
- dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]);
-
- dispatch.group_segment_size =
- util_cpu_to_le32(program->sel.info.base.shared_size + info->variable_shared_mem);
-
- dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va);
-
- u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset,
- (struct pipe_resource **)&dispatch_buf);
-
- if (!dispatch_buf) {
- fprintf(stderr, "Error: Failed to allocate dispatch "
- "packet.");
- }
- radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, dispatch_buf,
- RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER);
-
- dispatch_va = dispatch_buf->gpu_address + dispatch_offset;
-
- radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
- radeon_emit(dispatch_va);
- radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0));
-
- si_resource_reference(&dispatch_buf, NULL);
- user_sgpr += 2;
- }
-
- if (AMD_HSA_BITS_GET(code_object->code_properties,
- AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) {
- radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2);
- radeon_emit(kernel_args_va);
- radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0));
- user_sgpr += 2;
- }
-
- for (i = 0; i < 3 && user_sgpr < 16; i++) {
- if (code_object->code_properties & workgroup_count_masks[i]) {
- radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1);
- radeon_emit(info->grid[i]);
- user_sgpr += 1;
- }
- }
- radeon_end();
-}
-
-static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object,
- const struct pipe_grid_info *info)
-{
- struct si_compute *program = sctx->cs_shader_state.program;
- struct si_resource *input_buffer = NULL;
- uint32_t kernel_args_offset = 0;
- uint32_t *kernel_args;
- void *kernel_args_ptr;
- uint64_t kernel_args_va;
-
- u_upload_alloc(sctx->b.const_uploader, 0, program->input_size,
- sctx->screen->info.tcc_cache_line_size, &kernel_args_offset,
- (struct pipe_resource **)&input_buffer, &kernel_args_ptr);
-
- if (unlikely(!kernel_args_ptr))
- return false;
-
- kernel_args = (uint32_t *)kernel_args_ptr;
- kernel_args_va = input_buffer->gpu_address + kernel_args_offset;
-
- memcpy(kernel_args, info->input, program->input_size);
-
- for (unsigned i = 0; i < program->input_size / 4; i++) {
- COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]);
- }
-
- radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, input_buffer,
- RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER);
-
- si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va);
- si_resource_reference(&input_buffer, NULL);
- return true;
-}
-
static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info)
{
struct si_compute *program = sctx->cs_shader_state.program;
@@ -1174,19 +886,17 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
struct si_context *sctx = (struct si_context *)ctx;
struct si_screen *sscreen = sctx->screen;
struct si_compute *program = sctx->cs_shader_state.program;
- const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc);
- int i;
+
+ if (program->shader.compilation_failed)
+ return;
+
bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug &&
info->block[0] * info->block[1] * info->block[2] > 256;
-
if (cs_regalloc_hang) {
sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS;
si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
}
- if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed)
- return;
-
si_check_dirty_buffers_textures(sctx);
if (sctx->has_graphics) {
@@ -1251,18 +961,14 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
/* First emit registers. */
bool prefetch;
- if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch,
+ if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc, &prefetch,
info->variable_shared_mem))
return;
si_emit_compute_shader_pointers(sctx);
- if (program->ir_type == PIPE_SHADER_IR_NATIVE &&
- unlikely(!si_upload_compute_input(sctx, code_object, info)))
- return;
-
/* Global buffers */
- for (i = 0; i < sctx->max_global_buffers; i++) {
+ for (unsigned i = 0; i < sctx->max_global_buffers; i++) {
struct si_resource *buffer = si_resource(sctx->global_buffers[i]);
if (!buffer) {
continue;
@@ -1283,8 +989,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
if (sctx->gfx_level >= GFX7 && sctx->screen->info.has_cp_dma && prefetch)
si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
- if (program->ir_type != PIPE_SHADER_IR_NATIVE)
- si_setup_nir_user_data(sctx, info);
+ si_setup_nir_user_data(sctx, info);
si_emit_dispatch_packets(sctx, info);
@@ -1323,10 +1028,8 @@ void si_destroy_compute(struct si_compute *program)
{
struct si_shader_selector *sel = &program->sel;
- if (program->ir_type != PIPE_SHADER_IR_NATIVE) {
- util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready);
- util_queue_fence_destroy(&sel->ready);
- }
+ util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready);
+ util_queue_fence_destroy(&sel->ready);
si_shader_destroy(&program->shader);
ralloc_free(program->sel.nir);
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 01606465a35..3eca3d8aad8 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -942,8 +942,6 @@ void si_init_shader_caps(struct si_screen *sscreen)
caps->max_shader_images = SI_NUM_IMAGES;
caps->supported_irs = (1 << PIPE_SHADER_IR_TGSI) | (1 << PIPE_SHADER_IR_NIR);
- if (i == PIPE_SHADER_COMPUTE)
- caps->supported_irs |= 1 << PIPE_SHADER_IR_NATIVE;
/* Supported boolean features. */
caps->cont_supported = true;
@@ -984,12 +982,7 @@ void si_init_compute_caps(struct si_screen *sscreen)
caps->max_block_size[1] =
caps->max_block_size[2] = 1024;
- caps->max_block_size_clover[0] =
- caps->max_block_size_clover[1] =
- caps->max_block_size_clover[2] = 256;
-
caps->max_threads_per_block = 1024;
- caps->max_threads_per_block_clover = 256;
caps->address_bits = 64;
/* Return 1/4 of the heap size as the maximum because the max size is not practically
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index fb0ffc6f42b..1a69bd45f89 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -828,10 +828,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
}
if (sctx->gfx_level == GFX7) {
- /* Clear the NULL constant buffer, because loads should return zeros.
- * Note that this forces CP DMA to be used, because clover deadlocks
- * for some reason when the compute codepath is used.
- */
+ /* Clear the NULL constant buffer, because loads should return zeros. */
uint32_t clear_value = 0;
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, sctx->null_const_buf.buffer, 0,
sctx->null_const_buf.buffer->width0, clear_value);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index db0750fb59e..a9207b3e20a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -673,9 +673,6 @@ struct si_screen {
struct si_compute {
struct si_shader_selector sel;
struct si_shader shader;
-
- unsigned ir_type;
- unsigned input_size;
};
struct si_sampler_view {