diff options
| author | Marek Olšák <marek.olsak@amd.com> | 2025-03-26 10:19:40 -0400 |
|---|---|---|
| committer | Marge Bot <emma+marge@anholt.net> | 2025-03-27 01:59:19 +0000 |
| commit | 219b2cde13e52f1248b475cb1d7cee453f1a0c65 (patch) | |
| tree | 0a031f9302bf86fa065519735d490341241bcd20 | |
| parent | 77fb09c8cdaf2b5fb48d7f76a03798fa4286ad1b (diff) | |
radeonsi: remove clover support
Only Rusticl is supported.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34215>
| -rw-r--r-- | src/amd/common/amd_kernel_code_t.h | 553 | ||||
| -rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 365 | ||||
| -rw-r--r-- | src/gallium/drivers/radeonsi/si_get.c | 7 | ||||
| -rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 5 | ||||
| -rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 3 |
5 files changed, 35 insertions, 898 deletions
diff --git a/src/amd/common/amd_kernel_code_t.h b/src/amd/common/amd_kernel_code_t.h deleted file mode 100644 index 05767d5153b..00000000000 --- a/src/amd/common/amd_kernel_code_t.h +++ /dev/null @@ -1,553 +0,0 @@ -/* - * Copyright 2015,2016 Advanced Micro Devices, Inc. - * - * SPDX-License-Identifier: MIT - */ - -#ifndef AMDKERNELCODET_H -#define AMDKERNELCODET_H - -//---------------------------------------------------------------------------// -// AMD Kernel Code, and its dependencies // -//---------------------------------------------------------------------------// - -// Sets val bits for specified mask in specified dst packed instance. -#define AMD_HSA_BITS_SET(dst, mask, val) \ - dst &= (~(1 << mask##_SHIFT) & ~mask); \ - dst |= (((val) << mask##_SHIFT) & mask) - -// Gets bits for specified mask from specified src packed instance. -#define AMD_HSA_BITS_GET(src, mask) ((src & mask) >> mask##_SHIFT) - -/* Every amd_*_code_t has the following properties, which are composed of - * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), - * bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount - * (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. - * - * (Note that bit fields cannot be used as their layout is - * implementation defined in the C standard and so cannot be used to - * specify an ABI) - */ -enum amd_code_property_mask_t -{ - - /* Enable the setup of the SGPR user data registers - * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t - * for initial register state. - * - * The total number of SGPRuser data registers requested must not - * exceed 16. Any requests beyond 16 will be ignored. - * - * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of - * SGPR user data registers enabled up to 16). - */ - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = - ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT = 10, - AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32 = - ((1 << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, - - AMD_CODE_PROPERTY_RESERVED1_SHIFT = 11, - AMD_CODE_PROPERTY_RESERVED1_WIDTH = 5, - AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) - << AMD_CODE_PROPERTY_RESERVED1_SHIFT, - - /* Control wave ID base counter for GDS ordered-append. Used to set - * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if - * ORDERED_APPEND_MODE also needs to be settable) - */ - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = - ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) - << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, - - /* The interleave (swizzle) element size in bytes required by the - * code for private memory. This must be 2, 4, 8 or 16. This value - * is provided to the finalizer when it is invoked and is recorded - * here. The hardware will interleave the memory requests of each - * lane of a wavefront by this element size to ensure each - * work-item gets a distinct memory memory location. Therefore, the - * finalizer ensures that all load and store operations done to - * private memory do not exceed this size. For example, if the - * element size is 4 (32-bits or dword) and a 64-bit value must be - * loaded, the finalizer will generate two 32-bit loads. This - * ensures that the interleaving will get the work-item - * specific dword for both halves of the 64-bit value. If it just - * did a 64-bit load then it would get one dword which belonged to - * its own work-item, but the second dword would belong to the - * adjacent lane work-item since the interleaving is in dwords. - * - * The value used must match the value that the runtime configures - * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This - * is generally DWORD. - * - * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. - */ - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = - ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) - << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, - - /* Are global memory addresses 64 bits. Must match - * amd_kernel_code_t.hsail_machine_model == - * HSA_MACHINE_LARGE. Must also match - * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), - * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). - */ - AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19, - AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, - AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) - << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, - - /* Indicate if the generated ISA is using a dynamically sized call - * stack. This can happen if calls are implemented using a call - * stack and recursion, alloca or calls to indirect functions are - * present. In these cases the Finalizer cannot compute the total - * private segment size at compile time. In this case the - * workitem_private_segment_byte_size only specifies the statically - * know private segment size, and additional space must be added - * for the call stack. - */ - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = - ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) - << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, - - /* Indicate if code generated has support for debugging. */ - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) - << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, - - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) - << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT, - - AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23, - AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9, - AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) - << AMD_CODE_PROPERTY_RESERVED2_SHIFT -}; - -/* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel - * Code Object to set up the hardware to execute the kernel dispatch. - * - * Initial Kernel Register State. - * - * Initial kernel register state will be set up by CP/SPI prior to the start - * of execution of every wavefront. This is limited by the constraints of the - * current hardware. - * - * The order of the SGPR registers is defined, but the Finalizer can specify - * which ones are actually setup in the amd_kernel_code_t object using the - * enable_sgpr_* bit fields. The register numbers used for enabled registers - * are dense starting at SGPR0: the first enabled register is SGPR0, the next - * enabled register is SGPR1 etc.; disabled registers do not have an SGPR - * number. - * - * The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and - * apply to all waves of the grid. It is possible to specify more than 16 User - * SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 - * are actually initialized. These are then immediately followed by the System - * SGPRs that are set up by ADC/SPI and can have different values for each wave - * of the grid dispatch. - * - * SGPR register initial state is defined as follows: - * - * Private Segment Buffer (enable_sgpr_private_segment_buffer): - * Number of User SGPR registers: 4. V# that can be used, together with - * Scratch Wave Offset as an offset, to access the Private/Spill/Arg - * segments using a segment address. It must be set as follows: - * - Base address: of the scratch memory area used by the dispatch. It - * does not include the scratch wave offset. It will be the per process - * SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for - * example there may be a per pipe offset, or per AQL Queue offset). - * - Stride + data_format: Element Size * Index Stride (???) - * - Cache swizzle: ??? - * - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for - * scratch) - * - Num records: Flat Scratch Work Item Size / Element Size (???) - * - Dst_sel_*: ??? - * - Num_format: ??? - * - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must - * agree with amd_kernel_code_t.privateElementSize) - * - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must - * be number of wavefront lanes for scratch, must agree with - * amd_kernel_code_t.wavefrontSize) - * - Add tid enable: 1 - * - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, - * - Hash_enable: ??? - * - Heap: ??? - * - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE - * - Type: 0 (a buffer) (???) - * - * Dispatch Ptr (enable_sgpr_dispatch_ptr): - * Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet - * for kernel actually executing. - * - * Queue Ptr (enable_sgpr_queue_ptr): - * Number of User SGPR registers: 2. 64 bit address of AmdQueue object for - * AQL queue on which the dispatch packet was queued. - * - * Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): - * Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This - * is directly copied from the kernargPtr in the dispatch packet. Having CP - * load it once avoids loading it at the beginning of every wavefront. - * - * Dispatch Id (enable_sgpr_dispatch_id): - * Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch - * packet being executed. - * - * Flat Scratch Init (enable_sgpr_flat_scratch_init): - * Number of User SGPR registers: 2. This is 2 SGPRs. - * - * For CI/VI: - * The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE - * to base of memory for scratch for this dispatch. This is the same offset - * used in computing the Scratch Segment Buffer base address. The value of - * Scratch Wave Offset must be added by the kernel code and moved to - * SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. - * - * The second SGPR is 32 bit byte size of a single work-item's scratch - * memory usage. This is directly loaded from the dispatch packet Private - * Segment Byte Size and rounded up to a multiple of DWORD. - * - * \todo [Does CP need to round this to >4 byte alignment?] - * - * The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in - * flat memory instructions. Having CP load it once avoids loading it at - * the beginning of every wavefront. - * - * Private Segment Size (enable_sgpr_private_segment_size): - * Number of User SGPR registers: 1. The 32 bit byte size of a single - * work-item's scratch memory allocation. This is the value from the dispatch - * packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. - * - * \todo [Does CP need to round this to >4 byte alignment?] - * - * Having CP load it once avoids loading it at the beginning of every - * wavefront. - * - * \todo [This will not be used for CI/VI since it is the same value as - * the second SGPR of Flat Scratch Init. - * - * Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): - * Number of User SGPR registers: 1. 32 bit count of the number of - * work-groups in the X dimension for the grid being executed. Computed from - * the fields in the HsaDispatchPacket as - * ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). - * - * Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): - * Number of User SGPR registers: 1. 32 bit count of the number of - * work-groups in the Y dimension for the grid being executed. Computed from - * the fields in the HsaDispatchPacket as - * ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). - * - * Only initialized if <16 previous SGPRs initialized. - * - * Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): - * Number of User SGPR registers: 1. 32 bit count of the number of - * work-groups in the Z dimension for the grid being executed. Computed - * from the fields in the HsaDispatchPacket as - * ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). - * - * Only initialized if <16 previous SGPRs initialized. - * - * Work-Group Id X (enable_sgpr_workgroup_id_x): - * Number of System SGPR registers: 1. 32 bit work group id in X dimension - * of grid for wavefront. Always present. - * - * Work-Group Id Y (enable_sgpr_workgroup_id_y): - * Number of System SGPR registers: 1. 32 bit work group id in Y dimension - * of grid for wavefront. - * - * Work-Group Id Z (enable_sgpr_workgroup_id_z): - * Number of System SGPR registers: 1. 32 bit work group id in Z dimension - * of grid for wavefront. If present then Work-group Id Y will also be - * present - * - * Work-Group Info (enable_sgpr_workgroup_info): - * Number of System SGPR registers: 1. {first_wave, 14'b0000, - * ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} - * - * Private Segment Wave Byte Offset - * (enable_sgpr_private_segment_wave_byte_offset): - * Number of System SGPR registers: 1. 32 bit byte offset from base of - * dispatch scratch base. Must be used as an offset with Private/Spill/Arg - * segment address when using Scratch Segment Buffer. It must be added to - * Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. - * - * - * The order of the VGPR registers is defined, but the Finalizer can specify - * which ones are actually setup in the amd_kernel_code_t object using the - * enableVgpr* bit fields. The register numbers used for enabled registers - * are dense starting at VGPR0: the first enabled register is VGPR0, the next - * enabled register is VGPR1 etc.; disabled registers do not have an VGPR - * number. - * - * VGPR register initial state is defined as follows: - * - * Work-Item Id X (always initialized): - * Number of registers: 1. 32 bit work item id in X dimension of work-group - * for wavefront lane. - * - * Work-Item Id X (enable_vgpr_workitem_id > 0): - * Number of registers: 1. 32 bit work item id in Y dimension of work-group - * for wavefront lane. - * - * Work-Item Id X (enable_vgpr_workitem_id > 0): - * Number of registers: 1. 32 bit work item id in Z dimension of work-group - * for wavefront lane. - * - * - * The setting of registers is being done by existing GPU hardware as follows: - * 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data - * registers. - * 2) Work-group Id registers X, Y, Z are set by SPI which supports any - * combination including none. - * 3) Scratch Wave Offset is also set by SPI which is why its value cannot - * be added into the value Flat Scratch Offset which would avoid the - * Finalizer generated prolog having to do the add. - * 4) The VGPRs are set by SPI which only supports specifying either (X), - * (X, Y) or (X, Y, Z). - * - * Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so - * they can be moved as a 64 bit value to the hardware required SGPRn-3 and - * SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register. - * - * The global segment can be accessed either using flat operations or buffer - * operations. If buffer operations are used then the Global Buffer used to - * access HSAIL Global/Readonly/Kernarg (which are combine) segments using a - * segment address is not passed into the kernel code by CP since its base - * address is always 0. Instead the Finalizer generates prolog code to - * initialize 4 SGPRs with a V# that has the following properties, and then - * uses that in the buffer instructions: - * - base address of 0 - * - no swizzle - * - ATC=1 - * - MTYPE set to support memory coherence specified in - * amd_kernel_code_t.globalMemoryCoherence - * - * When the Global Buffer is used to access the Kernarg segment, must add the - * dispatch packet kernArgPtr to a kernarg segment address before using this V#. - * Alternatively scalar loads can be used if the kernarg offset is uniform, as - * the kernarg segment is constant for the duration of the kernel execution. - */ - -typedef struct amd_kernel_code_s { - uint32_t amd_kernel_code_version_major; - uint32_t amd_kernel_code_version_minor; - uint16_t amd_machine_kind; - uint16_t amd_machine_version_major; - uint16_t amd_machine_version_minor; - uint16_t amd_machine_version_stepping; - - /* Byte offset (possibly negative) from start of amd_kernel_code_t - * object to kernel's entry point instruction. The actual code for - * the kernel is required to be 256 byte aligned to match hardware - * requirements (SQ cache line is 16). The code must be position - * independent code (PIC) for AMD devices to give runtime the - * option of copying code to discrete GPU memory or APU L2 - * cache. The Finalizer should endeavour to allocate all kernel - * machine code in contiguous memory pages so that a device - * pre-fetcher will tend to only pre-fetch Kernel Code objects, - * improving cache performance. - */ - int64_t kernel_code_entry_byte_offset; - - /* Range of bytes to consider prefetching expressed as an offset - * and size. The offset is from the start (possibly negative) of - * amd_kernel_code_t object. Set both to 0 if no prefetch - * information is available. - */ - int64_t kernel_code_prefetch_byte_offset; - uint64_t kernel_code_prefetch_byte_size; - - /* Number of bytes of scratch backing memory required for full - * occupancy of target chip. This takes into account the number of - * bytes of scratch per work-item, the wavefront size, the maximum - * number of wavefronts per CU, and the number of CUs. This is an - * upper limit on scratch. If the grid being dispatched is small it - * may only need less than this. If the kernel uses no scratch, or - * the Finalizer has not computed this value, it must be 0. - */ - uint64_t max_scratch_backing_memory_byte_size; - - /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and - * COMPUTE_PGM_RSRC2 registers. - */ - uint64_t compute_pgm_resource_registers; - - /* Code properties. See amd_code_property_mask_t for a full list of - * properties. - */ - uint32_t code_properties; - - /* The amount of memory required for the combined private, spill - * and arg segments for a work-item in bytes. If - * is_dynamic_callstack is 1 then additional space must be added to - * this value for the call stack. - */ - uint32_t workitem_private_segment_byte_size; - - /* The amount of group segment memory required by a work-group in - * bytes. This does not include any dynamically allocated group - * segment memory that may be added when the kernel is - * dispatched. - */ - uint32_t workgroup_group_segment_byte_size; - - /* Number of byte of GDS required by kernel dispatch. Must be 0 if - * not using GDS. - */ - uint32_t gds_segment_byte_size; - - /* The size in bytes of the kernarg segment that holds the values - * of the arguments to the kernel. This could be used by CP to - * prefetch the kernarg segment pointed to by the dispatch packet. - */ - uint64_t kernarg_segment_byte_size; - - /* Number of fbarrier's used in the kernel and all functions it - * calls. If the implementation uses group memory to allocate the - * fbarriers then that amount must already be included in the - * workgroup_group_segment_byte_size total. - */ - uint32_t workgroup_fbarrier_count; - - /* Number of scalar registers used by a wavefront. This includes - * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size - * and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a - * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. - */ - uint16_t wavefront_sgpr_count; - - /* Number of vector registers used by each work-item. Used to set - * COMPUTE_PGM_RSRC1.VGPRS. - */ - uint16_t workitem_vgpr_count; - - /* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the - * first fixed VGPR number reserved. - */ - uint16_t reserved_vgpr_first; - - /* The number of consecutive VGPRs reserved by the client. If - * is_debug_supported then this count includes VGPRs reserved - * for debugger use. - */ - uint16_t reserved_vgpr_count; - - /* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the - * first fixed SGPR number reserved. - */ - uint16_t reserved_sgpr_first; - - /* The number of consecutive SGPRs reserved by the client. If - * is_debug_supported then this count includes SGPRs reserved - * for debugger use. - */ - uint16_t reserved_sgpr_count; - - /* If is_debug_supported is 0 then must be 0. Otherwise, this is the - * fixed SGPR number used to hold the wave scratch offset for the - * entire kernel execution, or uint16_t(-1) if the register is not - * used or not known. - */ - uint16_t debug_wavefront_private_segment_offset_sgpr; - - /* If is_debug_supported is 0 then must be 0. Otherwise, this is the - * fixed SGPR number of the first of 4 SGPRs used to hold the - * scratch V# used for the entire kernel execution, or uint16_t(-1) - * if the registers are not used or not known. - */ - uint16_t debug_private_segment_buffer_sgpr; - - /* The maximum byte alignment of variables used by the kernel in - * the specified memory segment. Expressed as a power of two. Must - * be at least HSA_POWERTWO_16. - */ - uint8_t kernarg_segment_alignment; - uint8_t group_segment_alignment; - uint8_t private_segment_alignment; - - /* Wavefront size expressed as a power of two. Must be a power of 2 - * in range 1..64 inclusive. Used to support runtime query that - * obtains wavefront size, which may be used by application to - * allocated dynamic group memory and set the dispatch work-group - * size. - */ - uint8_t wavefront_size; - - int32_t call_convention; - uint8_t reserved3[12]; - uint64_t runtime_loader_kernel_symbol; - uint64_t control_directives[16]; -} amd_kernel_code_t; - -#endif // AMDKERNELCODET_H diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 4a395b4c859..892dd89be56 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -5,7 +5,6 @@ */ #include "ac_rtld.h" -#include "amd_kernel_code_t.h" #include "nir/tgsi_to_nir.h" #include "si_build_pm4.h" #include "si_shader_internal.h" @@ -21,71 +20,6 @@ fprintf(stderr, fmt, ##args); \ } while (0); -struct dispatch_packet { - uint16_t header; - uint16_t setup; - uint16_t workgroup_size_x; - uint16_t workgroup_size_y; - uint16_t workgroup_size_z; - uint16_t reserved0; - uint32_t grid_size_x; - uint32_t grid_size_y; - uint32_t grid_size_z; - uint32_t group_segment_size; - uint64_t kernel_object; - uint64_t kernarg_address; - uint64_t reserved2; -}; - -static const amd_kernel_code_t *si_compute_get_code_object(const struct si_compute *program, - uint64_t symbol_offset) -{ - const struct si_shader_selector *sel = &program->sel; - - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - return NULL; - - struct ac_rtld_binary rtld; - if (!ac_rtld_open(&rtld, - (struct ac_rtld_open_info){.info = &sel->screen->info, - .shader_type = MESA_SHADER_COMPUTE, - .num_parts = 1, - .elf_ptrs = &program->shader.binary.code_buffer, - .elf_sizes = &program->shader.binary.code_size})) - return NULL; - - const amd_kernel_code_t *result = NULL; - const char *text; - size_t size; - if (!ac_rtld_get_section_by_name(&rtld, ".text", &text, &size)) - goto out; - - if (symbol_offset + sizeof(amd_kernel_code_t) > size) - goto out; - - result = (const amd_kernel_code_t *)(text + symbol_offset); - -out: - ac_rtld_close(&rtld); - return result; -} - -static void code_object_to_config(const amd_kernel_code_t *code_object, - struct ac_shader_config *out_config) -{ - - uint32_t rsrc1 = code_object->compute_pgm_resource_registers; - uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32; - out_config->num_sgprs = code_object->wavefront_sgpr_count; - out_config->num_vgprs = code_object->workitem_vgpr_count; - out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); - out_config->rsrc1 = rsrc1; - out_config->lds_size = MAX2(out_config->lds_size, G_00B84C_LDS_SIZE(rsrc2)); - out_config->rsrc2 = rsrc2; - out_config->scratch_bytes_per_wave = - align(code_object->workitem_private_segment_byte_size * 64, 1024); -} - /* Asynchronous compute shader compilation. */ static void si_create_compute_state_async(void *job, void *gdata, int thread_index) { @@ -101,7 +35,6 @@ static void si_create_compute_state_async(void *job, void *gdata, int thread_ind assert(thread_index < ARRAY_SIZE(sscreen->compiler)); compiler = &sscreen->compiler[thread_index]; - assert(program->ir_type == PIPE_SHADER_IR_NIR); si_nir_scan_shader(sscreen, sel->nir, &sel->info, false); if (!sel->nir->info.use_aco_amd && !*compiler) @@ -223,6 +156,10 @@ static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe { struct si_context *sctx = (struct si_context *)ctx; struct si_screen *sscreen = (struct si_screen *)ctx->screen; + + if (cso->ir_type == PIPE_SHADER_IR_NATIVE) + return NULL; + struct si_compute *program = CALLOC_STRUCT(si_compute); struct si_shader_selector *sel = &program->sel; @@ -236,61 +173,25 @@ static void *si_create_compute_state(struct pipe_context *ctx, const struct pipe si_sampler_and_image_descriptors_idx(PIPE_SHADER_COMPUTE); sel->info.base.shared_size = cso->static_shared_mem; program->shader.selector = &program->sel; - program->ir_type = cso->ir_type; - program->input_size = cso->req_input_mem; - - if (cso->ir_type != PIPE_SHADER_IR_NATIVE) { - if (cso->ir_type == PIPE_SHADER_IR_TGSI) { - program->ir_type = PIPE_SHADER_IR_NIR; - sel->nir = tgsi_to_nir(cso->prog, ctx->screen, true); - } else { - assert(cso->ir_type == PIPE_SHADER_IR_NIR); - sel->nir = (struct nir_shader *)cso->prog; - } - sel->nir->info.shared_size = cso->static_shared_mem; - - if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_INIT_NIR)) - nir_print_shader(sel->nir, stderr); - - sel->compiler_ctx_state.debug = sctx->debug; - sel->compiler_ctx_state.is_debug_context = sctx->is_debug; - p_atomic_inc(&sscreen->num_shaders_created); - - si_schedule_initial_compile(sctx, MESA_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state, - program, si_create_compute_state_async); + if (cso->ir_type == PIPE_SHADER_IR_TGSI) { + sel->nir = tgsi_to_nir(cso->prog, ctx->screen, true); } else { - const struct pipe_binary_program_header *header; - header = cso->prog; - - program->shader.binary.type = SI_SHADER_BINARY_ELF; - program->shader.binary.code_size = header->num_bytes; - program->shader.binary.code_buffer = malloc(header->num_bytes); - if (!program->shader.binary.code_buffer) { - FREE(program); - return NULL; - } - memcpy((void *)program->shader.binary.code_buffer, header->blob, header->num_bytes); - - const amd_kernel_code_t *code_object = si_compute_get_code_object(program, 0); - code_object_to_config(code_object, &program->shader.config); + assert(cso->ir_type == PIPE_SHADER_IR_NIR); + sel->nir = (struct nir_shader *)cso->prog; + } - if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) - program->shader.wave_size = 32; - else - program->shader.wave_size = 64; + sel->nir->info.shared_size = cso->static_shared_mem; - bool ok = si_shader_binary_upload(sctx->screen, &program->shader, 0); - si_shader_dump(sctx->screen, &program->shader, &sctx->debug, stderr, true); + if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_INIT_NIR)) + nir_print_shader(sel->nir, stderr); - if (!ok) { - fprintf(stderr, "LLVM failed to upload shader\n"); - free((void *)program->shader.binary.code_buffer); - FREE(program); - return NULL; - } - } + sel->compiler_ctx_state.debug = sctx->debug; + sel->compiler_ctx_state.is_debug_context = sctx->is_debug; + p_atomic_inc(&sscreen->num_shaders_created); + si_schedule_initial_compile(sctx, MESA_SHADER_COMPUTE, &sel->ready, &sel->compiler_ctx_state, + program, si_create_compute_state_async); return program; } @@ -300,8 +201,6 @@ static void si_get_compute_state_info(struct pipe_context *ctx, void *state, struct si_compute *program = (struct si_compute *)state; struct si_shader_selector *sel = &program->sel; - assert(program->ir_type != PIPE_SHADER_IR_NATIVE); - /* Wait because we need the compilation to finish first */ util_queue_fence_wait(&sel->ready); @@ -323,8 +222,7 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state) return; /* Wait because we need active slot usage masks. */ - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - util_queue_fence_wait(&sel->ready); + util_queue_fence_wait(&sel->ready); si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, @@ -434,51 +332,29 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_s } static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute *program, - struct si_shader *shader, const amd_kernel_code_t *code_object, - unsigned offset, bool *prefetch, unsigned variable_shared_size) + struct si_shader *shader, unsigned offset, bool *prefetch, + unsigned variable_shared_size) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; - struct ac_shader_config inline_config = {0}; - const struct ac_shader_config *config; + const struct ac_shader_config *config = &shader->config; unsigned rsrc2; - uint64_t shader_va; unsigned stage = shader->selector->info.base.stage; *prefetch = false; - assert(variable_shared_size == 0 || stage == MESA_SHADER_KERNEL || program->ir_type == PIPE_SHADER_IR_NATIVE); + assert(variable_shared_size == 0 || stage == MESA_SHADER_KERNEL); if (sctx->cs_shader_state.emitted_program == program && sctx->cs_shader_state.offset == offset && sctx->cs_shader_state.variable_shared_size == variable_shared_size) return true; - if (program->ir_type != PIPE_SHADER_IR_NATIVE) { - config = &shader->config; - } else { - code_object_to_config(code_object, &inline_config); - config = &inline_config; - } /* copy rsrc2 so we don't have to change it inside the si_shader object */ rsrc2 = config->rsrc2; /* only do this for OpenCL */ - if (program->ir_type == PIPE_SHADER_IR_NATIVE || stage == MESA_SHADER_KERNEL) { + if (stage == MESA_SHADER_KERNEL) { unsigned shared_size = program->sel.info.base.shared_size + variable_shared_size; - unsigned lds_blocks; - - /* Clover uses the compute API differently than other frontends and expects drivers to parse - * the shared_size out of the shader headers. - */ - if (program->ir_type == PIPE_SHADER_IR_NATIVE) { - lds_blocks = config->lds_size; - } else { - lds_blocks = 0; - } + unsigned lds_blocks = 0; - /* XXX: We are over allocating LDS. For GFX6, the shader reports - * LDS in blocks of 256 bytes, so if there are 4 bytes lds - * allocated in the shader and 4 bytes allocated by the state - * tracker, then we will set LDS_SIZE to 512 bytes rather than 256. - */ if (sctx->gfx_level <= GFX6) { lds_blocks += align(shared_size, 256) >> 8; } else { @@ -516,12 +392,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute RADEON_USAGE_READWRITE | RADEON_PRIO_SCRATCH_BUFFER); } - shader_va = shader->bo->gpu_address + offset; - if (program->ir_type == PIPE_SHADER_IR_NATIVE) { - /* Shader code is placed after the amd_kernel_code_t - * struct. */ - shader_va += sizeof(amd_kernel_code_t); - } + uint64_t shader_va = shader->bo->gpu_address + offset; radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo, RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY); @@ -604,165 +475,6 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute return true; } -static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx, - const amd_kernel_code_t *code_object, unsigned user_sgpr) -{ - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; - - unsigned max_private_element_size = - AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE); - - uint32_t scratch_dword0 = scratch_va & 0xffffffff; - uint32_t scratch_dword1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); - - if (sctx->gfx_level >= GFX11) - scratch_dword1 |= S_008F04_SWIZZLE_ENABLE_GFX11(1); - else - scratch_dword1 |= S_008F04_SWIZZLE_ENABLE_GFX6(1); - - /* Disable address clamping */ - uint32_t scratch_dword2 = 0xffffffff; - uint32_t index_stride = sctx->cs_shader_state.program->shader.wave_size == 32 ? 2 : 3; - uint32_t scratch_dword3 = S_008F0C_INDEX_STRIDE(index_stride) | S_008F0C_ADD_TID_ENABLE(1); - - if (sctx->gfx_level >= GFX9) { - assert(max_private_element_size == 1); /* only 4 bytes on GFX9 */ - } else { - scratch_dword3 |= S_008F0C_ELEMENT_SIZE(max_private_element_size); - - if (sctx->gfx_level < GFX8) { - /* BUF_DATA_FORMAT is ignored, but it cannot be - * BUF_DATA_FORMAT_INVALID. */ - scratch_dword3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_8); - } - } - - radeon_begin(cs); - radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4); - radeon_emit(scratch_dword0); - radeon_emit(scratch_dword1); - radeon_emit(scratch_dword2); - radeon_emit(scratch_dword3); - radeon_end(); -} - -static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object, - const struct pipe_grid_info *info, uint64_t kernel_args_va) -{ - struct si_compute *program = sctx->cs_shader_state.program; - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - - static const enum amd_code_property_mask_t workgroup_count_masks[] = { - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z}; - - unsigned i, user_sgpr = 0; - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { - if (code_object->workitem_private_segment_byte_size > 0) { - setup_scratch_rsrc_user_sgprs(sctx, code_object, user_sgpr); - } - user_sgpr += 4; - } - - radeon_begin(cs); - - if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) { - struct dispatch_packet dispatch; - unsigned dispatch_offset; - struct si_resource *dispatch_buf = NULL; - uint64_t dispatch_va; - - /* Upload dispatch ptr */ - memset(&dispatch, 0, sizeof(dispatch)); - - dispatch.workgroup_size_x = util_cpu_to_le16(info->block[0]); - dispatch.workgroup_size_y = util_cpu_to_le16(info->block[1]); - dispatch.workgroup_size_z = util_cpu_to_le16(info->block[2]); - - dispatch.grid_size_x = util_cpu_to_le32(info->grid[0] * info->block[0]); - dispatch.grid_size_y = util_cpu_to_le32(info->grid[1] * info->block[1]); - dispatch.grid_size_z = util_cpu_to_le32(info->grid[2] * info->block[2]); - - dispatch.group_segment_size = - util_cpu_to_le32(program->sel.info.base.shared_size + info->variable_shared_mem); - - dispatch.kernarg_address = util_cpu_to_le64(kernel_args_va); - - u_upload_data(sctx->b.const_uploader, 0, sizeof(dispatch), 256, &dispatch, &dispatch_offset, - (struct pipe_resource **)&dispatch_buf); - - if (!dispatch_buf) { - fprintf(stderr, "Error: Failed to allocate dispatch " - "packet."); - } - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, dispatch_buf, - RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER); - - dispatch_va = dispatch_buf->gpu_address + dispatch_offset; - - radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(dispatch_va); - radeon_emit(S_008F04_BASE_ADDRESS_HI(dispatch_va >> 32) | S_008F04_STRIDE(0)); - - si_resource_reference(&dispatch_buf, NULL); - user_sgpr += 2; - } - - if (AMD_HSA_BITS_GET(code_object->code_properties, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { - radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 2); - radeon_emit(kernel_args_va); - radeon_emit(S_008F04_BASE_ADDRESS_HI(kernel_args_va >> 32) | S_008F04_STRIDE(0)); - user_sgpr += 2; - } - - for (i = 0; i < 3 && user_sgpr < 16; i++) { - if (code_object->code_properties & workgroup_count_masks[i]) { - radeon_set_sh_reg_seq(R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 1); - radeon_emit(info->grid[i]); - user_sgpr += 1; - } - } - radeon_end(); -} - -static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object, - const struct pipe_grid_info *info) -{ - struct si_compute *program = sctx->cs_shader_state.program; - struct si_resource *input_buffer = NULL; - uint32_t kernel_args_offset = 0; - uint32_t *kernel_args; - void *kernel_args_ptr; - uint64_t kernel_args_va; - - u_upload_alloc(sctx->b.const_uploader, 0, program->input_size, - sctx->screen->info.tcc_cache_line_size, &kernel_args_offset, - (struct pipe_resource **)&input_buffer, &kernel_args_ptr); - - if (unlikely(!kernel_args_ptr)) - return false; - - kernel_args = (uint32_t *)kernel_args_ptr; - kernel_args_va = input_buffer->gpu_address + kernel_args_offset; - - memcpy(kernel_args, info->input, program->input_size); - - for (unsigned i = 0; i < program->input_size / 4; i++) { - COMPUTE_DBG(sctx->screen, "input %u : %u\n", i, kernel_args[i]); - } - - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, input_buffer, - RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER); - - si_setup_user_sgprs_co_v2(sctx, code_object, info, kernel_args_va); - si_resource_reference(&input_buffer, NULL); - return true; -} - static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_grid_info *info) { struct si_compute *program = sctx->cs_shader_state.program; @@ -1174,19 +886,17 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info struct si_context *sctx = (struct si_context *)ctx; struct si_screen *sscreen = sctx->screen; struct si_compute *program = sctx->cs_shader_state.program; - const amd_kernel_code_t *code_object = si_compute_get_code_object(program, info->pc); - int i; + + if (program->shader.compilation_failed) + return; + bool cs_regalloc_hang = sscreen->info.has_cs_regalloc_hang_bug && info->block[0] * info->block[1] * info->block[2] > 256; - if (cs_regalloc_hang) { sctx->barrier_flags |= SI_BARRIER_SYNC_PS | SI_BARRIER_SYNC_CS; si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier); } - if (program->ir_type != PIPE_SHADER_IR_NATIVE && program->shader.compilation_failed) - return; - si_check_dirty_buffers_textures(sctx); if (sctx->has_graphics) { @@ -1251,18 +961,14 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info /* First emit registers. */ bool prefetch; - if (!si_switch_compute_shader(sctx, program, &program->shader, code_object, info->pc, &prefetch, + if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc, &prefetch, info->variable_shared_mem)) return; si_emit_compute_shader_pointers(sctx); - if (program->ir_type == PIPE_SHADER_IR_NATIVE && - unlikely(!si_upload_compute_input(sctx, code_object, info))) - return; - /* Global buffers */ - for (i = 0; i < sctx->max_global_buffers; i++) { + for (unsigned i = 0; i < sctx->max_global_buffers; i++) { struct si_resource *buffer = si_resource(sctx->global_buffers[i]); if (!buffer) { continue; @@ -1283,8 +989,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info if (sctx->gfx_level >= GFX7 && sctx->screen->info.has_cp_dma && prefetch) si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0); - if (program->ir_type != PIPE_SHADER_IR_NATIVE) - si_setup_nir_user_data(sctx, info); + si_setup_nir_user_data(sctx, info); si_emit_dispatch_packets(sctx, info); @@ -1323,10 +1028,8 @@ void si_destroy_compute(struct si_compute *program) { struct si_shader_selector *sel = &program->sel; - if (program->ir_type != PIPE_SHADER_IR_NATIVE) { - util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready); - util_queue_fence_destroy(&sel->ready); - } + util_queue_drop_job(&sel->screen->shader_compiler_queue, &sel->ready); + util_queue_fence_destroy(&sel->ready); si_shader_destroy(&program->shader); ralloc_free(program->sel.nir); diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 01606465a35..3eca3d8aad8 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -942,8 +942,6 @@ void si_init_shader_caps(struct si_screen *sscreen) caps->max_shader_images = SI_NUM_IMAGES; caps->supported_irs = (1 << PIPE_SHADER_IR_TGSI) | (1 << PIPE_SHADER_IR_NIR); - if (i == PIPE_SHADER_COMPUTE) - caps->supported_irs |= 1 << PIPE_SHADER_IR_NATIVE; /* Supported boolean features. */ caps->cont_supported = true; @@ -984,12 +982,7 @@ void si_init_compute_caps(struct si_screen *sscreen) caps->max_block_size[1] = caps->max_block_size[2] = 1024; - caps->max_block_size_clover[0] = - caps->max_block_size_clover[1] = - caps->max_block_size_clover[2] = 256; - caps->max_threads_per_block = 1024; - caps->max_threads_per_block_clover = 256; caps->address_bits = 64; /* Return 1/4 of the heap size as the maximum because the max size is not practically diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index fb0ffc6f42b..1a69bd45f89 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -828,10 +828,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign } if (sctx->gfx_level == GFX7) { - /* Clear the NULL constant buffer, because loads should return zeros. - * Note that this forces CP DMA to be used, because clover deadlocks - * for some reason when the compute codepath is used. - */ + /* Clear the NULL constant buffer, because loads should return zeros. */ uint32_t clear_value = 0; si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, clear_value); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index db0750fb59e..a9207b3e20a 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -673,9 +673,6 @@ struct si_screen { struct si_compute { struct si_shader_selector sel; struct si_shader shader; - - unsigned ir_type; - unsigned input_size; }; struct si_sampler_view { |
