Skip to content

Commit

Permalink
Query register capacity for segred and segscan codegen.
Browse files Browse the repository at this point in the history
This is very tedious code, and required adding the notion of "kernel
constant expressions", as we have some expressions that _must_ be
constant at kernel compilation time (which is at program runtime). We
actually had this notion in the ImpCode representation, but now ImpGen
provides some manual control as well.
  • Loading branch information
athas committed Dec 7, 2023
1 parent 07fa0b4 commit 5c6af29
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 76 deletions.
2 changes: 1 addition & 1 deletion src/Futhark/CodeGen/Backends/GPU.hs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ getParamByKey :: Name -> C.Exp
getParamByKey key = [C.cexp|*ctx->tuning_params.$id:key|]

kernelConstToExp :: KernelConst -> C.Exp
kernelConstToExp (SizeConst key) =
kernelConstToExp (SizeConst key _) =
getParamByKey key
kernelConstToExp (SizeMaxConst size_class) =
[C.cexp|ctx->$id:field|]
Expand Down
2 changes: 1 addition & 1 deletion src/Futhark/CodeGen/Backends/PyOpenCL.hs
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ getParamByKey :: Name -> PyExp
getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)

kernelConstToExp :: Imp.KernelConst -> PyExp
kernelConstToExp (Imp.SizeConst key) =
kernelConstToExp (Imp.SizeConst key _) =
getParamByKey key
kernelConstToExp (Imp.SizeMaxConst size_class) =
Var $ "self.max_" <> prettyString size_class
Expand Down
2 changes: 1 addition & 1 deletion src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ getParamByKey :: Name -> PyExp
getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)

kernelConstToExp :: KernelConst -> PyExp
kernelConstToExp (SizeConst key) =
kernelConstToExp (SizeConst key _) =
getParamByKey key
kernelConstToExp (SizeMaxConst size_class) =
Var $ "self.max_" <> prettyString size_class
Expand Down
10 changes: 6 additions & 4 deletions src/Futhark/CodeGen/ImpCode/GPU.hs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ type KernelCode = Code KernelOp

-- | A run-time constant related to kernels.
data KernelConst
= SizeConst Name
= SizeConst Name SizeClass
| SizeMaxConst SizeClass
deriving (Eq, Ord, Show)

Expand Down Expand Up @@ -85,11 +85,13 @@ data KernelUse
deriving (Eq, Ord, Show)

instance Pretty KernelConst where
pretty (SizeConst key) = "get_size" <> parens (pretty key)
pretty (SizeMaxConst size_class) = "get_max_size" <> parens (pretty size_class)
pretty (SizeConst key size_class) =
"get_size" <> parens (commasep [pretty key, pretty size_class])
pretty (SizeMaxConst size_class) =
"get_max_size" <> parens (pretty size_class)

instance FreeIn KernelConst where
freeIn' (SizeConst _) = mempty
freeIn' SizeConst {} = mempty
freeIn' (SizeMaxConst _) = mempty

instance Pretty KernelUse where
Expand Down
77 changes: 50 additions & 27 deletions src/Futhark/CodeGen/ImpGen/GPU/Base.hs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ module Futhark.CodeGen.ImpGen.GPU.Base
updateAcc,
genZeroes,
isPrimParam,
kernelConstToExp,
getChunkSize,

-- * Host-level bulk operations
Expand Down Expand Up @@ -261,26 +262,40 @@ fenceForArrays = fmap (foldl' max Imp.FenceLocal) . mapM need
isPrimParam :: (Typed p) => Param p -> Bool
isPrimParam = primType . paramType

-- | Given a list of parameter types, compute the largest available chunk size
-- given the parameters for which we want chunking and the available resources.
-- Used in SegScan.SinglePass.compileSegScan, and SegRed.compileSegRed (with
-- primitive non-commutative operators only).
getChunkSize :: (Num a) => [Type] -> a
getChunkSize types = fromInteger $ max 1 $ min mem_constraint reg_constraint
kernelConstToExp :: Imp.KernelConstExp -> CallKernelGen Imp.Exp
kernelConstToExp = traverse f
where
types' = map elemType $ filter primType types
sizes = map primByteSize types'

sum_sizes = sum sizes
sum_sizes' = sum (map (max 4 . primByteSize) types') `div` 4
max_size = maximum sizes

mem_constraint = max k_mem sum_sizes `div` max_size
reg_constraint = (k_reg - 1 - sum_sizes') `div` (2 * sum_sizes')

-- TODO: Make these constants dynamic by querying device
k_reg = 64
k_mem = 95
f (Imp.SizeMaxConst c) = do
v <- dPrim (prettyString c) int64
sOp $ Imp.GetSizeMax (tvVar v) c
pure $ tvVar v
f (Imp.SizeConst k c) = do
v <- dPrim (nameToString k) int64
sOp $ Imp.GetSize (tvVar v) k c
pure $ tvVar v

-- | Given available register and cacha list of parameter types,
-- compute the largest available chunk size given the parameters for
-- which we want chunking and the available resources. Used in
-- 'SegScan.SinglePass.compileSegScan', and 'SegRed.compileSegRed'
-- (with primitive non-commutative operators only).
getChunkSize :: [Type] -> Imp.KernelConstExp
getChunkSize types = do
let max_group_size = Imp.SizeMaxConst SizeGroup
max_group_mem = Imp.SizeMaxConst SizeLocalMemory
max_group_reg = Imp.SizeMaxConst SizeRegisters
k_mem = le64 max_group_mem `quot` le64 max_group_size
k_reg = le64 max_group_reg `quot` le64 max_group_size
types' = map elemType $ filter primType types
sizes = map primByteSize types'

sum_sizes = sum sizes
sum_sizes' = sum (map (sMax64 4 . primByteSize) types') `quot` 4
max_size = maximum sizes

mem_constraint = max k_mem sum_sizes `quot` max_size
reg_constraint = (k_reg - 1 - sum_sizes') `quot` (2 * sum_sizes')
untyped $ sMax64 1 $ sMin64 mem_constraint reg_constraint

inBlockScan ::
KernelConstants ->
Expand Down Expand Up @@ -920,10 +935,10 @@ isConstExp vtable size = do
let onLeaf name _ = lookupConstExp name
lookupConstExp name =
constExp =<< hasExp =<< M.lookup name vtable
constExp (Op (Inner (SizeOp (GetSize key _)))) =
Just $ LeafExp (Imp.SizeConst $ keyWithEntryPoint fname key) int32
constExp (Op (Inner (SizeOp (GetSizeMax size_class)))) =
Just $ LeafExp (Imp.SizeMaxConst size_class) int32
constExp (Op (Inner (SizeOp (GetSize key c)))) =
Just $ LeafExp (Imp.SizeConst (keyWithEntryPoint fname key) c) int32
constExp (Op (Inner (SizeOp (GetSizeMax c)))) =
Just $ LeafExp (Imp.SizeMaxConst c) int32
constExp e = primExpFromExp lookupConstExp e
pure $ replaceInPrimExpM onLeaf size
where
Expand Down Expand Up @@ -1112,7 +1127,12 @@ data KernelAttrs = KernelAttrs
-- | Number of groups.
kAttrNumGroups :: Count NumGroups SubExp,
-- | Group size.
kAttrGroupSize :: Count GroupSize SubExp
kAttrGroupSize :: Count GroupSize SubExp,
-- | Variables that are specially in scope inside the kernel.
-- Operationally, these will be available at kernel compile time
-- (which happens at run-time, with access to machine-specific
-- information).
kAttrConstExps :: M.Map VName Imp.KernelConstExp
}

-- | The default kernel attributes.
Expand All @@ -1125,7 +1145,8 @@ defKernelAttrs num_groups group_size =
{ kAttrFailureTolerant = False,
kAttrCheckLocalMemory = True,
kAttrNumGroups = num_groups,
kAttrGroupSize = group_size
kAttrGroupSize = group_size,
kAttrConstExps = mempty
}

getSize :: String -> SizeClass -> CallKernelGen (TV Int64)
Expand Down Expand Up @@ -1190,12 +1211,12 @@ sKernelOp ::
sKernelOp attrs constants ops name m = do
HostEnv atomics _ locks <- askEnv
body <- makeAllMemoryGlobal $ subImpM_ (KernelEnv atomics constants locks) ops m
uses <- computeKernelUses body mempty
uses <- computeKernelUses body $ M.keys $ kAttrConstExps attrs
group_size <- onGroupSize $ kernelGroupSize constants
emit . Imp.Op . Imp.CallKernel $
Imp.Kernel
{ Imp.kernelBody = body,
Imp.kernelUses = uses,
Imp.kernelUses = uses <> map constToUse (M.toList (kAttrConstExps attrs)),
Imp.kernelNumGroups = [untyped $ kernelNumGroups constants],
Imp.kernelGroupSize = [group_size],
Imp.kernelName = name,
Expand All @@ -1213,6 +1234,8 @@ sKernelOp attrs constants ops name m = do
Just (LeafExp kc _) -> Right kc
_ -> Left $ untyped e

constToUse (v, e) = Imp.ConstUse v e

sKernelFailureTolerant ::
Bool ->
Operations GPUMem KernelEnv Imp.KernelOp ->
Expand Down
3 changes: 2 additions & 1 deletion src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,8 @@ compileSegHist ::
KernelBody GPUMem ->
CallKernelGen ()
compileSegHist (Pat pes) lvl space ops kbody = do
KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
lvlKernelAttrs lvl
-- Most of this function is not the histogram part itself, but
-- rather figuring out whether to use a local or global memory
-- strategy, as well as collapsing the subhistograms produced (which
Expand Down
67 changes: 40 additions & 27 deletions src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ where

import Control.Monad
import Data.List (genericLength, zip4)
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
Expand Down Expand Up @@ -106,7 +107,8 @@ compileSegRed ::
CallKernelGen ()
compileSegRed pat lvl space segbinops map_kbody = do
emit $ Imp.DebugPrint "\n# SegRed" Nothing
KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
lvlKernelAttrs lvl
let grid = KernelGrid num_groups group_size

compileSegRed' pat grid space segbinops $ \red_cont ->
Expand Down Expand Up @@ -142,31 +144,32 @@ compileSegRed' pat grid space segbinops map_body_cont
| genericLength segbinops > maxNumOps =
compilerLimitationS $
"compileSegRed': at most " ++ show maxNumOps ++ " reduction operators are supported."
| [(_, Constant (IntValue (Int64Value 1))), _] <- unSegSpace space =
compileReduction nonsegmentedReduction
| otherwise = do
let segment_size = pe64 $ last $ segSpaceDims space
use_small_segments = segment_size * 2 .<. group_size_E * chunk_E
sIf
use_small_segments
(compileReduction smallSegmentsReduction)
(compileReduction largeSegmentsReduction)
chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
case unSegSpace space of
[(_, Constant (IntValue (Int64Value 1))), _] ->
compileReduction (chunk_v, chunk_const) nonsegmentedReduction
_ -> do
let segment_size = pe64 $ last $ segSpaceDims space
use_small_segments = segment_size * 2 .<. pe64 (unCount group_size) * tvExp chunk_v
sIf
use_small_segments
(compileReduction (chunk_v, chunk_const) smallSegmentsReduction)
(compileReduction (chunk_v, chunk_const) largeSegmentsReduction)
where
compileReduction f =
compileReduction chunk f =
f pat num_groups group_size chunk space segbinops map_body_cont

chunk
| Noncommutative <- mconcat (map segBinOpComm segbinops),
all isPrimSegBinOp segbinops =
intConst Int64 $ getChunkSize param_types
| otherwise = intConst Int64 1

param_types = map paramType $ concatMap paramOf segbinops

num_groups = gridNumGroups grid
group_size = gridGroupSize grid
group_size_E = pe64 $ unCount group_size
chunk_E = pe64 chunk

chunk_const =
if Noncommutative `elem` map segBinOpComm segbinops
&& all isPrimSegBinOp segbinops
then getChunkSize param_types
else Imp.ValueExp $ IntValue $ intValue Int64 (1 :: Int64)

-- | Prepare intermediate arrays for the reduction. Prim-typed
-- arguments go in local memory (so we need to do the allocation of
Expand Down Expand Up @@ -290,16 +293,16 @@ type DoCompileSegRed =
Pat LetDecMem ->
Count NumGroups SubExp ->
Count GroupSize SubExp ->
SubExp ->
(TV Int64, Imp.KernelConstExp) ->
SegSpace ->
[SegBinOp GPUMem] ->
DoSegBody ->
CallKernelGen ()

nonsegmentedReduction :: DoCompileSegRed
nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
nonsegmentedReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
let (gtids, dims) = unzip $ unSegSpace space
chunk = pe64 chunk_se
chunk = tvExp chunk_v
num_groups_se = unCount num_groups
group_size_se = unCount group_size
group_size' = pe64 group_size_se
Expand All @@ -313,12 +316,17 @@ nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segb
num_threads <-
fmap tvSize $ dPrimV "num_threads" $ pe64 num_groups_se * group_size'

sKernelThread "segred_nonseg" (segFlat space) (defKernelAttrs num_groups group_size) $ do
let attrs =
(defKernelAttrs num_groups group_size)
{ kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
}

sKernelThread "segred_nonseg" (segFlat space) attrs $ do
constants <- kernelConstants <$> askEnv
let ltid = kernelLocalThreadId constants
let group_id = kernelGroupId constants

interms <- makeIntermArrays (sExt64 group_id) group_size_se chunk_se segbinops
interms <- makeIntermArrays (sExt64 group_id) group_size_se (tvSize chunk_v) segbinops
sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

-- Since this is the nonsegmented case, all outer segment IDs must
Expand Down Expand Up @@ -473,15 +481,15 @@ smallSegmentsReduction (Pat segred_pes) num_groups group_size _ space segbinops
sOp $ Imp.Barrier Imp.FenceLocal

largeSegmentsReduction :: DoCompileSegRed
largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
largeSegmentsReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
let (gtids, dims) = unzip $ unSegSpace space
dims' = map pe64 dims
num_segments = product $ init dims'
segment_size = last dims'
num_groups' = pe64 $ unCount num_groups
group_size_se = unCount group_size
group_size' = pe64 group_size_se
chunk = pe64 chunk_se
chunk = tvExp chunk_v

groups_per_segment <-
dPrimVE "groups_per_segment" $
Expand Down Expand Up @@ -522,12 +530,17 @@ largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space seg
let num_counters = maxNumOps * 1024
counters <- genZeroes "counters" $ fromIntegral num_counters

sKernelThread "segred_large" (segFlat space) (defKernelAttrs num_groups group_size) $ do
let attrs =
(defKernelAttrs num_groups group_size)
{ kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
}

sKernelThread "segred_large" (segFlat space) attrs $ do
constants <- kernelConstants <$> askEnv
let group_id = sExt64 $ kernelGroupId constants
ltid = kernelLocalThreadId constants

interms <- makeIntermArrays group_id group_size_se chunk_se segbinops
interms <- makeIntermArrays group_id group_size_se (tvSize chunk_v) segbinops
sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

-- We probably do not have enough actual workgroups to cover the
Expand Down
Loading

0 comments on commit 5c6af29

Please sign in to comment.