swr/rast: Unaligned and translations in gathers

- added graphics address translation in odd gathers - added support for unaligned gathers in fetch shader - changed how 2+ GB offsets are handled to make them compatible with unaligned offsets
author: Alok Hota <alok.hota@intel.com> 2018-08-30 17:45:06 -0500
committer: Alok Hota <alok.hota@intel.com> 2019-01-16 13:53:30 -0600
commit: c722ad737907b58efa04c8ef0bd330c5adee4043 (patch)
tree: cb7662463cf02b9c5985436312b209778c37dc3e
parent: 9459863dfa16071ff5088a15d853028f2865c4a7 (diff)
1 files changed, 35 insertions, 21 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index d294a67050c..6feb1a76e63 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -368,7 +368,7 @@ void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[
 // gather SIMD full pixels per lane then shift/mask to move each component to their
 // own vector
 void FetchJit::CreateGatherOddFormats(
-    SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
+    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
 {
     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 
@@ -378,7 +378,7 @@ void FetchJit::CreateGatherOddFormats(
     Value* pGather;
     if (info.bpp == 32)
     {
-        pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+        pGather = GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask);
     }
     else
     {
@@ -386,29 +386,40 @@ void FetchJit::CreateGatherOddFormats(
         Value* pMem = ALLOCA(mSimdInt32Ty);
         STORE(VIMMED1(0u), pMem);
 
-        pBase          = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
-        Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
+        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
 
         for (uint32_t lane = 0; lane < mVWidth; ++lane)
         {
             // Get index
             Value* index = VEXTRACT(pOffsets, C(lane));
             Value* mask  = VEXTRACT(pMask, C(lane));
+
+            // use branch around load based on mask
+            // Needed to avoid page-faults on unmasked lanes
+            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
+            BasicBlock* pMaskedLoadBlock =
+                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
+            BasicBlock* pEndLoadBB = BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
+
+            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
+
+            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
+
             switch (info.bpp)
             {
             case 8:
             {
                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
-                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
-                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
+                STORE(LOAD(xpSrc, "", mInt8PtrTy, GFX_MEM_CLIENT_FETCH), pDst);
                 break;
             }
 
             case 16:
             {
                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
-                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
-                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
+                STORE(LOAD(xpSrc, "", mInt16PtrTy, GFX_MEM_CLIENT_FETCH), pDst);
                 break;
             }
             break;
@@ -417,13 +428,13 @@ void FetchJit::CreateGatherOddFormats(
             {
                 // First 16-bits of data
                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
-                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
-                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
+                STORE(LOAD(xpSrc, "", mInt16PtrTy, GFX_MEM_CLIENT_FETCH), pDst);
 
                 // Last 8-bits of data
                 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
-                pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
-                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+                xpSrc = ADD(xpSrc, C(2));
+                STORE(LOAD(xpSrc, "", mInt8PtrTy, GFX_MEM_CLIENT_FETCH), pDst);
                 break;
             }
 
@@ -431,6 +442,9 @@ void FetchJit::CreateGatherOddFormats(
                 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
                 break;
             }
+
+            BR(pEndLoadBB);
+            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
         }
 
         pGather = LOAD(pMem);
@@ -616,7 +630,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
         // do 64bit address offset calculations.
 
         // calculate byte offset to the start of the VB
-        Value* baseOffset     = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
+        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
 
         // VGATHER* takes an *i8 src pointer so that's what stream is
         Value* pStreamBaseGFX = ADD(stream, baseOffset);
@@ -781,17 +795,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                         {
                             // Gather a SIMD of vertices
                             // APIs allow a 4GB range for offsets
-                            // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
-                            // But, we know that elements must be aligned for FETCH. :)
-                            // Right shift the offset by a bit and then scale by 2 to remove the
-                            // sign extension.
-                            Value* vShiftedOffsets = LSHR(vOffsets, 1);
+                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
+                            // Add 2GB to the base pointer and 2GB to the offsets.  This makes
+                            // "negative" (large) offsets into positive offsets and small offsets
+                            // into negative offsets.
+                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
                             vVertexElements[currentVertexElement++] =
                                 GATHERPS(gatherSrc,
-                                         pStreamBaseGFX,
-                                         vShiftedOffsets,
+                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
+                                         vNewOffsets,
                                          vGatherMask,
-                                         2,
+                                         1,
                                          GFX_MEM_CLIENT_FETCH);
                         }
                         else
author	Alok Hota <alok.hota@intel.com>	2018-08-30 17:45:06 -0500
committer	Alok Hota <alok.hota@intel.com>	2019-01-16 13:53:30 -0600
commit	c722ad737907b58efa04c8ef0bd330c5adee4043 (patch)
tree	cb7662463cf02b9c5985436312b209778c37dc3e
parent	9459863dfa16071ff5088a15d853028f2865c4a7 (diff)