Query register capacity for segred and segscan codegen.

This is very tedious code, and required adding the notion of "kernel constant expressions", as we have some expressions that _must_ be constant at kernel compilation time (which is at program runtime). We actually had this notion in the ImpCode representation, but now ImpGen provides some manual control as well.
diku-dk · Dec 7, 2023 · 5c6af29 · 5c6af29
1 parent 07fa0b4
commit 5c6af29
Show file tree

Hide file tree

Showing 9 changed files with 121 additions and 76 deletions.
diff --git a/src/Futhark/CodeGen/Backends/GPU.hs b/src/Futhark/CodeGen/Backends/GPU.hs
@@ -71,7 +71,7 @@ getParamByKey :: Name -> C.Exp
 getParamByKey key = [C.cexp|*ctx->tuning_params.$id:key|]
 
 kernelConstToExp :: KernelConst -> C.Exp
-kernelConstToExp (SizeConst key) =
+kernelConstToExp (SizeConst key _) =
   getParamByKey key
 kernelConstToExp (SizeMaxConst size_class) =
   [C.cexp|ctx->$id:field|]

diff --git a/src/Futhark/CodeGen/Backends/PyOpenCL.hs b/src/Futhark/CodeGen/Backends/PyOpenCL.hs
@@ -207,7 +207,7 @@ getParamByKey :: Name -> PyExp
 getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)
 
 kernelConstToExp :: Imp.KernelConst -> PyExp
-kernelConstToExp (Imp.SizeConst key) =
+kernelConstToExp (Imp.SizeConst key _) =
   getParamByKey key
 kernelConstToExp (Imp.SizeMaxConst size_class) =
   Var $ "self.max_" <> prettyString size_class

diff --git a/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs b/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
@@ -34,7 +34,7 @@ getParamByKey :: Name -> PyExp
 getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)
 
 kernelConstToExp :: KernelConst -> PyExp
-kernelConstToExp (SizeConst key) =
+kernelConstToExp (SizeConst key _) =
   getParamByKey key
 kernelConstToExp (SizeMaxConst size_class) =
   Var $ "self.max_" <> prettyString size_class

diff --git a/src/Futhark/CodeGen/ImpCode/GPU.hs b/src/Futhark/CodeGen/ImpCode/GPU.hs
@@ -34,7 +34,7 @@ type KernelCode = Code KernelOp
 
 -- | A run-time constant related to kernels.
 data KernelConst
-  = SizeConst Name
+  = SizeConst Name SizeClass
   | SizeMaxConst SizeClass
   deriving (Eq, Ord, Show)
 
@@ -85,11 +85,13 @@ data KernelUse
   deriving (Eq, Ord, Show)
 
 instance Pretty KernelConst where
-  pretty (SizeConst key) = "get_size" <> parens (pretty key)
-  pretty (SizeMaxConst size_class) = "get_max_size" <> parens (pretty size_class)
+  pretty (SizeConst key size_class) =
+    "get_size" <> parens (commasep [pretty key, pretty size_class])
+  pretty (SizeMaxConst size_class) =
+    "get_max_size" <> parens (pretty size_class)
 
 instance FreeIn KernelConst where
-  freeIn' (SizeConst _) = mempty
+  freeIn' SizeConst {} = mempty
   freeIn' (SizeMaxConst _) = mempty
 
 instance Pretty KernelUse where

diff --git a/src/Futhark/CodeGen/ImpGen/GPU/Base.hs b/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
@@ -30,6 +30,7 @@ module Futhark.CodeGen.ImpGen.GPU.Base
     updateAcc,
     genZeroes,
     isPrimParam,
+    kernelConstToExp,
     getChunkSize,
 
     -- * Host-level bulk operations
@@ -261,26 +262,40 @@ fenceForArrays = fmap (foldl' max Imp.FenceLocal) . mapM need
 isPrimParam :: (Typed p) => Param p -> Bool
 isPrimParam = primType . paramType
 
--- | Given a list of parameter types, compute the largest available chunk size
--- given the parameters for which we want chunking and the available resources.
--- Used in SegScan.SinglePass.compileSegScan, and SegRed.compileSegRed (with
--- primitive non-commutative operators only).
-getChunkSize :: (Num a) => [Type] -> a
-getChunkSize types = fromInteger $ max 1 $ min mem_constraint reg_constraint
+kernelConstToExp :: Imp.KernelConstExp -> CallKernelGen Imp.Exp
+kernelConstToExp = traverse f
   where
-    types' = map elemType $ filter primType types
-    sizes = map primByteSize types'
-
-    sum_sizes = sum sizes
-    sum_sizes' = sum (map (max 4 . primByteSize) types') `div` 4
-    max_size = maximum sizes
-
-    mem_constraint = max k_mem sum_sizes `div` max_size
-    reg_constraint = (k_reg - 1 - sum_sizes') `div` (2 * sum_sizes')
-
-    -- TODO: Make these constants dynamic by querying device
-    k_reg = 64
-    k_mem = 95
+    f (Imp.SizeMaxConst c) = do
+      v <- dPrim (prettyString c) int64
+      sOp $ Imp.GetSizeMax (tvVar v) c
+      pure $ tvVar v
+    f (Imp.SizeConst k c) = do
+      v <- dPrim (nameToString k) int64
+      sOp $ Imp.GetSize (tvVar v) k c
+      pure $ tvVar v
+
+-- | Given available register and cacha list of parameter types,
+-- compute the largest available chunk size given the parameters for
+-- which we want chunking and the available resources. Used in
+-- 'SegScan.SinglePass.compileSegScan', and 'SegRed.compileSegRed'
+-- (with primitive non-commutative operators only).
+getChunkSize :: [Type] -> Imp.KernelConstExp
+getChunkSize types = do
+  let max_group_size = Imp.SizeMaxConst SizeGroup
+      max_group_mem = Imp.SizeMaxConst SizeLocalMemory
+      max_group_reg = Imp.SizeMaxConst SizeRegisters
+      k_mem = le64 max_group_mem `quot` le64 max_group_size
+      k_reg = le64 max_group_reg `quot` le64 max_group_size
+      types' = map elemType $ filter primType types
+      sizes = map primByteSize types'
+
+      sum_sizes = sum sizes
+      sum_sizes' = sum (map (sMax64 4 . primByteSize) types') `quot` 4
+      max_size = maximum sizes
+
+      mem_constraint = max k_mem sum_sizes `quot` max_size
+      reg_constraint = (k_reg - 1 - sum_sizes') `quot` (2 * sum_sizes')
+  untyped $ sMax64 1 $ sMin64 mem_constraint reg_constraint
 
 inBlockScan ::
   KernelConstants ->
@@ -920,10 +935,10 @@ isConstExp vtable size = do
   let onLeaf name _ = lookupConstExp name
       lookupConstExp name =
         constExp =<< hasExp =<< M.lookup name vtable
-      constExp (Op (Inner (SizeOp (GetSize key _)))) =
-        Just $ LeafExp (Imp.SizeConst $ keyWithEntryPoint fname key) int32
-      constExp (Op (Inner (SizeOp (GetSizeMax size_class)))) =
-        Just $ LeafExp (Imp.SizeMaxConst size_class) int32
+      constExp (Op (Inner (SizeOp (GetSize key c)))) =
+        Just $ LeafExp (Imp.SizeConst (keyWithEntryPoint fname key) c) int32
+      constExp (Op (Inner (SizeOp (GetSizeMax c)))) =
+        Just $ LeafExp (Imp.SizeMaxConst c) int32
       constExp e = primExpFromExp lookupConstExp e
   pure $ replaceInPrimExpM onLeaf size
   where
@@ -1112,7 +1127,12 @@ data KernelAttrs = KernelAttrs
     -- | Number of groups.
     kAttrNumGroups :: Count NumGroups SubExp,
     -- | Group size.
-    kAttrGroupSize :: Count GroupSize SubExp
+    kAttrGroupSize :: Count GroupSize SubExp,
+    -- | Variables that are specially in scope inside the kernel.
+    -- Operationally, these will be available at kernel compile time
+    -- (which happens at run-time, with access to machine-specific
+    -- information).
+    kAttrConstExps :: M.Map VName Imp.KernelConstExp
   }
 
 -- | The default kernel attributes.
@@ -1125,7 +1145,8 @@ defKernelAttrs num_groups group_size =
     { kAttrFailureTolerant = False,
       kAttrCheckLocalMemory = True,
       kAttrNumGroups = num_groups,
-      kAttrGroupSize = group_size
+      kAttrGroupSize = group_size,
+      kAttrConstExps = mempty
     }
 
 getSize :: String -> SizeClass -> CallKernelGen (TV Int64)
@@ -1190,12 +1211,12 @@ sKernelOp ::
 sKernelOp attrs constants ops name m = do
   HostEnv atomics _ locks <- askEnv
   body <- makeAllMemoryGlobal $ subImpM_ (KernelEnv atomics constants locks) ops m
-  uses <- computeKernelUses body mempty
+  uses <- computeKernelUses body $ M.keys $ kAttrConstExps attrs
   group_size <- onGroupSize $ kernelGroupSize constants
   emit . Imp.Op . Imp.CallKernel $
     Imp.Kernel
       { Imp.kernelBody = body,
-        Imp.kernelUses = uses,
+        Imp.kernelUses = uses <> map constToUse (M.toList (kAttrConstExps attrs)),
         Imp.kernelNumGroups = [untyped $ kernelNumGroups constants],
         Imp.kernelGroupSize = [group_size],
         Imp.kernelName = name,
@@ -1213,6 +1234,8 @@ sKernelOp attrs constants ops name m = do
           Just (LeafExp kc _) -> Right kc
           _ -> Left $ untyped e
 
+    constToUse (v, e) = Imp.ConstUse v e
+
 sKernelFailureTolerant ::
   Bool ->
   Operations GPUMem KernelEnv Imp.KernelOp ->

diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
@@ -1065,7 +1065,8 @@ compileSegHist ::
   KernelBody GPUMem ->
   CallKernelGen ()
 compileSegHist (Pat pes) lvl space ops kbody = do
-  KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
+  KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
+    lvlKernelAttrs lvl
   -- Most of this function is not the histogram part itself, but
   -- rather figuring out whether to use a local or global memory
   -- strategy, as well as collapsing the subhistograms produced (which

diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
@@ -57,6 +57,7 @@ where
 
 import Control.Monad
 import Data.List (genericLength, zip4)
+import Data.Map qualified as M
 import Data.Maybe
 import Futhark.CodeGen.ImpCode.GPU qualified as Imp
 import Futhark.CodeGen.ImpGen
@@ -106,7 +107,8 @@ compileSegRed ::
   CallKernelGen ()
 compileSegRed pat lvl space segbinops map_kbody = do
   emit $ Imp.DebugPrint "\n# SegRed" Nothing
-  KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
+  KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
+    lvlKernelAttrs lvl
   let grid = KernelGrid num_groups group_size
 
   compileSegRed' pat grid space segbinops $ \red_cont ->
@@ -142,31 +144,32 @@ compileSegRed' pat grid space segbinops map_body_cont
   | genericLength segbinops > maxNumOps =
       compilerLimitationS $
         "compileSegRed': at most " ++ show maxNumOps ++ " reduction operators are supported."
-  | [(_, Constant (IntValue (Int64Value 1))), _] <- unSegSpace space =
-      compileReduction nonsegmentedReduction
   | otherwise = do
-      let segment_size = pe64 $ last $ segSpaceDims space
-          use_small_segments = segment_size * 2 .<. group_size_E * chunk_E
-      sIf
-        use_small_segments
-        (compileReduction smallSegmentsReduction)
-        (compileReduction largeSegmentsReduction)
+      chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
+      case unSegSpace space of
+        [(_, Constant (IntValue (Int64Value 1))), _] ->
+          compileReduction (chunk_v, chunk_const) nonsegmentedReduction
+        _ -> do
+          let segment_size = pe64 $ last $ segSpaceDims space
+              use_small_segments = segment_size * 2 .<. pe64 (unCount group_size) * tvExp chunk_v
+          sIf
+            use_small_segments
+            (compileReduction (chunk_v, chunk_const) smallSegmentsReduction)
+            (compileReduction (chunk_v, chunk_const) largeSegmentsReduction)
   where
-    compileReduction f =
+    compileReduction chunk f =
       f pat num_groups group_size chunk space segbinops map_body_cont
 
-    chunk
-      | Noncommutative <- mconcat (map segBinOpComm segbinops),
-        all isPrimSegBinOp segbinops =
-          intConst Int64 $ getChunkSize param_types
-      | otherwise = intConst Int64 1
-
     param_types = map paramType $ concatMap paramOf segbinops
 
     num_groups = gridNumGroups grid
     group_size = gridGroupSize grid
-    group_size_E = pe64 $ unCount group_size
-    chunk_E = pe64 chunk
+
+    chunk_const =
+      if Noncommutative `elem` map segBinOpComm segbinops
+        && all isPrimSegBinOp segbinops
+        then getChunkSize param_types
+        else Imp.ValueExp $ IntValue $ intValue Int64 (1 :: Int64)
 
 -- | Prepare intermediate arrays for the reduction.  Prim-typed
 -- arguments go in local memory (so we need to do the allocation of
@@ -290,16 +293,16 @@ type DoCompileSegRed =
   Pat LetDecMem ->
   Count NumGroups SubExp ->
   Count GroupSize SubExp ->
-  SubExp ->
+  (TV Int64, Imp.KernelConstExp) ->
   SegSpace ->
   [SegBinOp GPUMem] ->
   DoSegBody ->
   CallKernelGen ()
 
 nonsegmentedReduction :: DoCompileSegRed
-nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
+nonsegmentedReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
   let (gtids, dims) = unzip $ unSegSpace space
-      chunk = pe64 chunk_se
+      chunk = tvExp chunk_v
       num_groups_se = unCount num_groups
       group_size_se = unCount group_size
       group_size' = pe64 group_size_se
@@ -313,12 +316,17 @@ nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segb
   num_threads <-
     fmap tvSize $ dPrimV "num_threads" $ pe64 num_groups_se * group_size'
 
-  sKernelThread "segred_nonseg" (segFlat space) (defKernelAttrs num_groups group_size) $ do
+  let attrs =
+        (defKernelAttrs num_groups group_size)
+          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
+          }
+
+  sKernelThread "segred_nonseg" (segFlat space) attrs $ do
     constants <- kernelConstants <$> askEnv
     let ltid = kernelLocalThreadId constants
     let group_id = kernelGroupId constants
 
-    interms <- makeIntermArrays (sExt64 group_id) group_size_se chunk_se segbinops
+    interms <- makeIntermArrays (sExt64 group_id) group_size_se (tvSize chunk_v) segbinops
     sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"
 
     -- Since this is the nonsegmented case, all outer segment IDs must
@@ -473,15 +481,15 @@ smallSegmentsReduction (Pat segred_pes) num_groups group_size _ space segbinops
       sOp $ Imp.Barrier Imp.FenceLocal
 
 largeSegmentsReduction :: DoCompileSegRed
-largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
+largeSegmentsReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
   let (gtids, dims) = unzip $ unSegSpace space
       dims' = map pe64 dims
       num_segments = product $ init dims'
       segment_size = last dims'
       num_groups' = pe64 $ unCount num_groups
       group_size_se = unCount group_size
       group_size' = pe64 group_size_se
-      chunk = pe64 chunk_se
+      chunk = tvExp chunk_v
 
   groups_per_segment <-
     dPrimVE "groups_per_segment" $
@@ -522,12 +530,17 @@ largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space seg
   let num_counters = maxNumOps * 1024
   counters <- genZeroes "counters" $ fromIntegral num_counters
 
-  sKernelThread "segred_large" (segFlat space) (defKernelAttrs num_groups group_size) $ do
+  let attrs =
+        (defKernelAttrs num_groups group_size)
+          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
+          }
+
+  sKernelThread "segred_large" (segFlat space) attrs $ do
     constants <- kernelConstants <$> askEnv
     let group_id = sExt64 $ kernelGroupId constants
         ltid = kernelLocalThreadId constants
 
-    interms <- makeIntermArrays group_id group_size_se chunk_se segbinops
+    interms <- makeIntermArrays group_id group_size_se (tvSize chunk_v) segbinops
     sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"
 
     -- We probably do not have enough actual workgroups to cover the