From 5c6af293d6461972a28bb46cd2c13edc9248a7d1 Mon Sep 17 00:00:00 2001
From: Troels Henriksen
Date: Wed, 6 Dec 2023 23:05:38 +0100
Subject: [PATCH] Query register capacity for segred and segscan codegen.

This is very tedious code, and required adding the notion of "kernel
constant expressions", as we have some expressions that _must_ be
constant at kernel compilation time (which is at program runtime). We
actually had this notion in the ImpCode representation, but now ImpGen
provides some manual control as well.
---
 src/Futhark/CodeGen/Backends/GPU.hs           |  2 +-
 src/Futhark/CodeGen/Backends/PyOpenCL.hs      |  2 +-
 .../CodeGen/Backends/PyOpenCL/Boilerplate.hs  |  2 +-
 src/Futhark/CodeGen/ImpCode/GPU.hs            | 10 ++-
 src/Futhark/CodeGen/ImpGen/GPU/Base.hs        | 77 ++++++++++++-------
 src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs     |  3 +-
 src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs      | 67 +++++++++-------
 .../CodeGen/ImpGen/GPU/SegScan/SinglePass.hs  | 32 ++++----
 src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs    |  2 +-
 9 files changed, 121 insertions(+), 76 deletions(-)

diff --git a/src/Futhark/CodeGen/Backends/GPU.hs b/src/Futhark/CodeGen/Backends/GPU.hs
index 9dd1859e2f..a2afba4321 100644
--- a/src/Futhark/CodeGen/Backends/GPU.hs
+++ b/src/Futhark/CodeGen/Backends/GPU.hs
@@ -71,7 +71,7 @@ getParamByKey :: Name -> C.Exp
getParamByKey key = [C.cexp|*ctx->tuning_params.$id:key|]

kernelConstToExp :: KernelConst -> C.Exp
-kernelConstToExp (SizeConst key) =
+kernelConstToExp (SizeConst key _) =
  getParamByKey key
kernelConstToExp (SizeMaxConst size_class) =
  [C.cexp|ctx->$id:field|]
diff --git a/src/Futhark/CodeGen/Backends/PyOpenCL.hs b/src/Futhark/CodeGen/Backends/PyOpenCL.hs
index 716aaeb355..3d2d6a85e1 100644
--- a/src/Futhark/CodeGen/Backends/PyOpenCL.hs
+++ b/src/Futhark/CodeGen/Backends/PyOpenCL.hs
@@ -207,7 +207,7 @@ getParamByKey :: Name -> PyExp
getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)

kernelConstToExp :: Imp.KernelConst -> PyExp
-kernelConstToExp (Imp.SizeConst key) =
+kernelConstToExp (Imp.SizeConst key _) =
  getParamByKey key
kernelConstToExp (Imp.SizeMaxConst size_class) =
  Var $ "self.max_" <> prettyString size_class
diff --git a/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs b/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
index 65e6f65e9c..aec8fda35c 100644
--- a/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
+++ b/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
@@ -34,7 +34,7 @@ getParamByKey :: Name -> PyExp
getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)

kernelConstToExp :: KernelConst -> PyExp
-kernelConstToExp (SizeConst key) =
+kernelConstToExp (SizeConst key _) =
  getParamByKey key
kernelConstToExp (SizeMaxConst size_class) =
  Var $ "self.max_" <> prettyString size_class
diff --git a/src/Futhark/CodeGen/ImpCode/GPU.hs b/src/Futhark/CodeGen/ImpCode/GPU.hs
index 6241e17de5..d526c9e2de 100644
--- a/src/Futhark/CodeGen/ImpCode/GPU.hs
+++ b/src/Futhark/CodeGen/ImpCode/GPU.hs
@@ -34,7 +34,7 @@ type KernelCode = Code KernelOp

-- | A run-time constant related to kernels.
data KernelConst
-  = SizeConst Name
+  = SizeConst Name SizeClass
  | SizeMaxConst SizeClass
  deriving (Eq, Ord, Show)

@@ -85,11 +85,13 @@ data KernelUse
  deriving (Eq, Ord, Show)

instance Pretty KernelConst where
-  pretty (SizeConst key) = "get_size" <> parens (pretty key)
-  pretty (SizeMaxConst size_class) = "get_max_size" <> parens (pretty size_class)
+  pretty (SizeConst key size_class) =
+    "get_size" <> parens (commasep [pretty key, pretty size_class])
+  pretty (SizeMaxConst size_class) =
+    "get_max_size" <> parens (pretty size_class)

instance FreeIn KernelConst where
-  freeIn' (SizeConst _) = mempty
+  freeIn' SizeConst {} = mempty
  freeIn' (SizeMaxConst _) = mempty

instance Pretty KernelUse where
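A `SizeConst` leaf now carries the `SizeClass` it was derived from, so later
stages can reconstruct the corresponding `GetSize` query. Kernel constants
compose into whole expressions; as a rough illustration (not part of the
patch, and assuming the usual `KernelConstExp = PrimExp KernelConst` synonym
together with the `le64`/`untyped` combinators used in `Base.hs` below), a
per-work-item register budget could be written as:

    -- Illustrative only: registers available per work-item, as an
    -- expression that is constant at kernel compilation time.
    regsPerThread :: KernelConstExp
    regsPerThread =
      untyped $
        le64 (SizeMaxConst SizeRegisters) `quot` le64 (SizeMaxConst SizeGroup)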
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/Base.hs b/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
index 34b109e2b9..2dc29789d1 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
@@ -30,6 +30,7 @@ module Futhark.CodeGen.ImpGen.GPU.Base
    updateAcc,
    genZeroes,
    isPrimParam,
+    kernelConstToExp,
    getChunkSize,

    -- * Host-level bulk operations
@@ -261,26 +262,40 @@ fenceForArrays = fmap (foldl' max Imp.FenceLocal) . mapM need
isPrimParam :: (Typed p) => Param p -> Bool
isPrimParam = primType . paramType

--- | Given a list of parameter types, compute the largest available chunk size
--- given the parameters for which we want chunking and the available resources.
--- Used in SegScan.SinglePass.compileSegScan, and SegRed.compileSegRed (with
--- primitive non-commutative operators only).
-getChunkSize :: (Num a) => [Type] -> a
-getChunkSize types = fromInteger $ max 1 $ min mem_constraint reg_constraint
+kernelConstToExp :: Imp.KernelConstExp -> CallKernelGen Imp.Exp
+kernelConstToExp = traverse f
  where
-    types' = map elemType $ filter primType types
-    sizes = map primByteSize types'
-
-    sum_sizes = sum sizes
-    sum_sizes' = sum (map (max 4 . primByteSize) types') `div` 4
-    max_size = maximum sizes
-
-    mem_constraint = max k_mem sum_sizes `div` max_size
-    reg_constraint = (k_reg - 1 - sum_sizes') `div` (2 * sum_sizes')
-
-    -- TODO: Make these constants dynamic by querying device
-    k_reg = 64
-    k_mem = 95
+    f (Imp.SizeMaxConst c) = do
+      v <- dPrim (prettyString c) int64
+      sOp $ Imp.GetSizeMax (tvVar v) c
+      pure $ tvVar v
+    f (Imp.SizeConst k c) = do
+      v <- dPrim (nameToString k) int64
+      sOp $ Imp.GetSize (tvVar v) k c
+      pure $ tvVar v
+
+-- | Given the available registers and cache, and a list of parameter
+-- types, compute the largest available chunk size given the parameters
+-- for which we want chunking and the available resources. Used in
+-- 'SegScan.SinglePass.compileSegScan', and 'SegRed.compileSegRed'
+-- (with primitive non-commutative operators only).
+getChunkSize :: [Type] -> Imp.KernelConstExp
+getChunkSize types = do
+  let max_group_size = Imp.SizeMaxConst SizeGroup
+      max_group_mem = Imp.SizeMaxConst SizeLocalMemory
+      max_group_reg = Imp.SizeMaxConst SizeRegisters
+      k_mem = le64 max_group_mem `quot` le64 max_group_size
+      k_reg = le64 max_group_reg `quot` le64 max_group_size
+      types' = map elemType $ filter primType types
+      sizes = map primByteSize types'
+
+      sum_sizes = sum sizes
+      sum_sizes' = sum (map (sMax64 4 . primByteSize) types') `quot` 4
+      max_size = maximum sizes
+
+      mem_constraint = max k_mem sum_sizes `quot` max_size
+      reg_constraint = (k_reg - 1 - sum_sizes') `quot` (2 * sum_sizes')
+  untyped $ sMax64 1 $ sMin64 mem_constraint reg_constraint

inBlockScan ::
  KernelConstants ->
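The constraint arithmetic is unchanged from the old host-side version; what
changes is that the per-thread register and local-memory budgets are now
obtained by dividing the queried per-group limits by the maximal group size,
rather than being hard-coded. A standalone sketch of the same formula, using
the constants the removed code used (64 registers and 95 bytes of fast memory
per thread) and taking the per-element sizes in bytes of a non-empty
parameter list:

    -- Illustrative only; mirrors the formula above with fixed budgets.
    chunkSketch :: [Integer] -> Integer
    chunkSketch sizes = max 1 (min mem_constraint reg_constraint)
      where
        k_reg = 64 :: Integer -- registers per thread (old hard-coded value)
        k_mem = 95 :: Integer -- bytes of fast memory per thread (old value)
        sum_sizes = sum sizes
        sum_sizes' = sum (map (max 4) sizes) `quot` 4
        max_size = maximum sizes
        mem_constraint = max k_mem sum_sizes `quot` max_size
        reg_constraint = (k_reg - 1 - sum_sizes') `quot` (2 * sum_sizes')

For a single f64 operand, chunkSketch [8] gives max 1 (min (95 `quot` 8)
(61 `quot` 4)) = min 11 15 = 11 sequential elements per thread.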
@@ -920,10 +935,10 @@ isConstExp vtable size = do
  let onLeaf name _ = lookupConstExp name
      lookupConstExp name = constExp =<< hasExp =<< M.lookup name vtable
-      constExp (Op (Inner (SizeOp (GetSize key _)))) =
-        Just $ LeafExp (Imp.SizeConst $ keyWithEntryPoint fname key) int32
-      constExp (Op (Inner (SizeOp (GetSizeMax size_class)))) =
-        Just $ LeafExp (Imp.SizeMaxConst size_class) int32
+      constExp (Op (Inner (SizeOp (GetSize key c)))) =
+        Just $ LeafExp (Imp.SizeConst (keyWithEntryPoint fname key) c) int32
+      constExp (Op (Inner (SizeOp (GetSizeMax c)))) =
+        Just $ LeafExp (Imp.SizeMaxConst c) int32
      constExp e = primExpFromExp lookupConstExp e
  pure $ replaceInPrimExpM onLeaf size
  where
@@ -1112,7 +1127,12 @@ data KernelAttrs = KernelAttrs
    -- | Number of groups.
    kAttrNumGroups :: Count NumGroups SubExp,
    -- | Group size.
-    kAttrGroupSize :: Count GroupSize SubExp
+    kAttrGroupSize :: Count GroupSize SubExp,
+    -- | Variables that are specially in scope inside the kernel.
+    -- Operationally, these will be available at kernel compile time
+    -- (which happens at run-time, with access to machine-specific
+    -- information).
+    kAttrConstExps :: M.Map VName Imp.KernelConstExp
  }

-- | The default kernel attributes.
@@ -1125,7 +1145,8 @@ defKernelAttrs num_groups group_size =
    { kAttrFailureTolerant = False,
      kAttrCheckLocalMemory = True,
      kAttrNumGroups = num_groups,
-      kAttrGroupSize = group_size
+      kAttrGroupSize = group_size,
+      kAttrConstExps = mempty
    }

getSize :: String -> SizeClass -> CallKernelGen (TV Int64)
@@ -1190,12 +1211,12 @@ sKernelOp ::
sKernelOp attrs constants ops name m = do
  HostEnv atomics _ locks <- askEnv
  body <- makeAllMemoryGlobal $ subImpM_ (KernelEnv atomics constants locks) ops m
-  uses <- computeKernelUses body mempty
+  uses <- computeKernelUses body $ M.keys $ kAttrConstExps attrs
  group_size <- onGroupSize $ kernelGroupSize constants
  emit . Imp.Op . Imp.CallKernel $
    Imp.Kernel
      { Imp.kernelBody = body,
-        Imp.kernelUses = uses,
+        Imp.kernelUses = uses <> map constToUse (M.toList (kAttrConstExps attrs)),
        Imp.kernelNumGroups = [untyped $ kernelNumGroups constants],
        Imp.kernelGroupSize = [group_size],
        Imp.kernelName = name,
@@ -1213,6 +1234,8 @@ sKernelOp attrs constants ops name m = do
        Just (LeafExp kc _) -> Right kc
        _ -> Left $ untyped e

+    constToUse (v, e) = Imp.ConstUse v e
+
sKernelFailureTolerant ::
  Bool ->
  Operations GPUMem KernelEnv Imp.KernelOp ->
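With `kAttrConstExps`, a caller binds a fresh name to a kernel constant
expression; `sKernelOp` then excludes those names from the ordinary scalar
uses and passes them as `ConstUse` instead, so their values are only fixed
when the kernel itself is compiled at program run time. Independently of any
kernel, the new `kernelConstToExp` also lets host code materialise such an
expression directly. A small illustrative fragment (not in the patch,
assumed to run in the `CallKernelGen` monad with `Base.hs`'s imports):

    -- Illustrative only: query a device limit and print it at run time.
    max_regs <- kernelConstToExp $ untyped $ le64 (Imp.SizeMaxConst SizeRegisters)
    emit $ Imp.DebugPrint "register budget" $ Just max_regs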
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
index 267ff24410..578d569320 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
@@ -1065,7 +1065,8 @@ compileSegHist ::
  KernelBody GPUMem ->
  CallKernelGen ()
compileSegHist (Pat pes) lvl space ops kbody = do
-  KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
+  KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
+    lvlKernelAttrs lvl
  -- Most of this function is not the histogram part itself, but
  -- rather figuring out whether to use a local or global memory
  -- strategy, as well as collapsing the subhistograms produced (which
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
index 8bbdb3bee2..3d6edbc438 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
@@ -57,6 +57,7 @@ where

import Control.Monad
import Data.List (genericLength, zip4)
+import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
@@ -106,7 +107,8 @@ compileSegRed ::
  CallKernelGen ()
compileSegRed pat lvl space segbinops map_kbody = do
  emit $ Imp.DebugPrint "\n# SegRed" Nothing
-  KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
+  KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
+    lvlKernelAttrs lvl
  let grid = KernelGrid num_groups group_size

  compileSegRed' pat grid space segbinops $ \red_cont ->
@@ -142,31 +144,32 @@ compileSegRed' pat grid space segbinops map_body_cont
  | genericLength segbinops > maxNumOps =
      compilerLimitationS $
        "compileSegRed': at most " ++ show maxNumOps ++ " reduction operators are supported."
-  | [(_, Constant (IntValue (Int64Value 1))), _] <- unSegSpace space =
-      compileReduction nonsegmentedReduction
  | otherwise = do
-      let segment_size = pe64 $ last $ segSpaceDims space
-          use_small_segments = segment_size * 2 .<. group_size_E * chunk_E
-      sIf
-        use_small_segments
-        (compileReduction smallSegmentsReduction)
-        (compileReduction largeSegmentsReduction)
+      chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
+      case unSegSpace space of
+        [(_, Constant (IntValue (Int64Value 1))), _] ->
+          compileReduction (chunk_v, chunk_const) nonsegmentedReduction
+        _ -> do
+          let segment_size = pe64 $ last $ segSpaceDims space
+              use_small_segments = segment_size * 2 .<. pe64 (unCount group_size) * tvExp chunk_v
+          sIf
+            use_small_segments
+            (compileReduction (chunk_v, chunk_const) smallSegmentsReduction)
+            (compileReduction (chunk_v, chunk_const) largeSegmentsReduction)
  where
-    compileReduction f =
+    compileReduction chunk f =
      f pat num_groups group_size chunk space segbinops map_body_cont

-    chunk
-      | Noncommutative <- mconcat (map segBinOpComm segbinops),
-        all isPrimSegBinOp segbinops =
-          intConst Int64 $ getChunkSize param_types
-      | otherwise = intConst Int64 1
-
    param_types = map paramType $ concatMap paramOf segbinops

    num_groups = gridNumGroups grid
    group_size = gridGroupSize grid
-    group_size_E = pe64 $ unCount group_size
-    chunk_E = pe64 chunk
+
+    chunk_const =
+      if Noncommutative `elem` map segBinOpComm segbinops
+        && all isPrimSegBinOp segbinops
+        then getChunkSize param_types
+        else Imp.ValueExp $ IntValue $ intValue Int64 (1 :: Int64)

-- | Prepare intermediate arrays for the reduction. Prim-typed
-- arguments go in local memory (so we need to do the allocation of
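The small-versus-large segment decision now uses the run-time chunk: a
segmented reduction takes the small-segments path when
segment_size * 2 < group_size * chunk. As a purely illustrative data point,
with a group size of 256 and the chunk of 11 from the sketch further above,
the threshold product is 2816, so segments of fewer than 1408 elements would
go down the small-segments path.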
@@ -290,16 +293,16 @@ type DoCompileSegRed =
  Pat LetDecMem ->
  Count NumGroups SubExp ->
  Count GroupSize SubExp ->
-  SubExp ->
+  (TV Int64, Imp.KernelConstExp) ->
  SegSpace ->
  [SegBinOp GPUMem] ->
  DoSegBody ->
  CallKernelGen ()

nonsegmentedReduction :: DoCompileSegRed
-nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
+nonsegmentedReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
-      chunk = pe64 chunk_se
+      chunk = tvExp chunk_v
      num_groups_se = unCount num_groups
      group_size_se = unCount group_size
      group_size' = pe64 group_size_se
@@ -313,12 +316,17 @@ nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segb
  num_threads <-
    fmap tvSize $ dPrimV "num_threads" $ pe64 num_groups_se * group_size'

-  sKernelThread "segred_nonseg" (segFlat space) (defKernelAttrs num_groups group_size) $ do
+  let attrs =
+        (defKernelAttrs num_groups group_size)
+          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
+          }
+
+  sKernelThread "segred_nonseg" (segFlat space) attrs $ do
    constants <- kernelConstants <$> askEnv
    let ltid = kernelLocalThreadId constants
    let group_id = kernelGroupId constants

-    interms <- makeIntermArrays (sExt64 group_id) group_size_se chunk_se segbinops
+    interms <- makeIntermArrays (sExt64 group_id) group_size_se (tvSize chunk_v) segbinops
    sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

    -- Since this is the nonsegmented case, all outer segment IDs must
@@ -473,7 +481,7 @@ smallSegmentsReduction (Pat segred_pes) num_groups group_size _ space segbinops
    sOp $ Imp.Barrier Imp.FenceLocal

largeSegmentsReduction :: DoCompileSegRed
-largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
+largeSegmentsReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
      dims' = map pe64 dims
      num_segments = product $ init dims'
      num_groups' = pe64 $ unCount num_groups
      group_size_se = unCount group_size
      group_size' = pe64 group_size_se
-      chunk = pe64 chunk_se
+      chunk = tvExp chunk_v

  groups_per_segment <-
    dPrimVE "groups_per_segment" $
@@ -522,12 +530,17 @@ largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space seg
  let num_counters = maxNumOps * 1024
  counters <- genZeroes "counters" $ fromIntegral num_counters

-  sKernelThread "segred_large" (segFlat space) (defKernelAttrs num_groups group_size) $ do
+  let attrs =
+        (defKernelAttrs num_groups group_size)
+          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
+          }
+
+  sKernelThread "segred_large" (segFlat space) attrs $ do
    constants <- kernelConstants <$> askEnv
    let group_id = sExt64 $ kernelGroupId constants
        ltid = kernelLocalThreadId constants

-    interms <- makeIntermArrays group_id group_size_se chunk_se segbinops
+    interms <- makeIntermArrays group_id group_size_se (tvSize chunk_v) segbinops
    sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

    -- We probably do not have enough actual workgroups to cover the
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs
index accbecdc40..0adedc7f6a 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs
@@ -7,6 +7,7 @@ module Futhark.CodeGen.ImpGen.GPU.SegScan.SinglePass (compileSegScan) where

import Control.Monad
import Data.List (zip4, zip7)
+import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
@@ -229,14 +230,15 @@ compileSegScan pat lvl space scan_op map_kbody = do

      tys' = lambdaReturnType $ segBinOpLambda scan_op

-      chunk :: (Num a) => a
-      chunk = getChunkSize tys'
-
      tys = map elemType tys'

      group_size_e = pe64 $ unCount $ kAttrGroupSize attrs
      num_physgroups_e = pe64 $ unCount $ kAttrNumGroups attrs

+  let chunk_const = getChunkSize tys'
+  chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
+  let chunk = tvExp chunk_v
+
  num_virtgroups <-
    tvSize <$> dPrimV "num_virtgroups" (n `divUp` (group_size_e * chunk))
  let num_virtgroups_e = pe64 num_virtgroups
@@ -250,7 +252,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
      not_segmented_e = fromBool $ not segmented
      segment_size = last dims'

-  emit $ Imp.DebugPrint "Sequential elements per thread (chunk)" $ Just $ untyped (chunk :: Imp.TExp Int32)
+  emit $ Imp.DebugPrint "Sequential elements per thread (chunk)" $ Just $ untyped chunk

  statusFlags <- sAllocArray "status_flags" int8 (Shape [num_virtgroups]) (Space "device")
  sReplicate statusFlags $ intConst Int8 statusX
@@ -264,14 +266,18 @@ compileSegScan pat lvl space scan_op map_kbody = do

  global_id <- genZeroes "global_dynid" 1

-  sKernelThread "segscan" (segFlat space) attrs $ do
+  let attrs' = attrs {kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const}
+
+  sKernelThread "segscan" (segFlat space) attrs' $ do
+    chunk32 <- dPrimVE "chunk_size_32b" $ sExt32 $ tvExp chunk_v
+
    constants <- kernelConstants <$> askEnv
    let ltid32 = kernelLocalThreadId constants
        ltid = sExt64 ltid32

    (sharedId, transposedArrays, prefixArrays, warpscan, exchanges) <-
-      createLocalArrays (kAttrGroupSize attrs) (intConst Int64 chunk) tys
+      createLocalArrays (kAttrGroupSize attrs) (tvSize chunk_v) tys

    -- We wrap the entire kernel body in a virtualisation loop to handle the
    -- case where we do not have enough workgroups to cover the iteration space.
@@ -333,8 +339,8 @@ compileSegScan pat lvl space scan_op map_kbody = do
          sAllocArray
            "private"
            ty
-            (Shape [intConst Int64 chunk])
-            (ScalarSpace [intConst Int64 chunk] ty)
+            (Shape [tvSize chunk_v])
+            (ScalarSpace [tvSize chunk_v] ty)

        thd_offset <- dPrimVE "thd_offset" $ block_offset + ltid
@@ -385,7 +391,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
        new_sgm <-
          if segmented
            then do
-              gidx <- dPrimVE "gidx" $ (ltid32 * chunk) + 1
+              gidx <- dPrimVE "gidx" $ (ltid32 * chunk32) + 1
              dPrimVE "new_sgm" $ (gidx + sExt32 i - boundary) `mod` segsize_compact .==. 0
            else pure false
        -- skip scan of first element in segment
@@ -408,8 +414,8 @@ compileSegScan pat lvl space scan_op map_kbody = do
      let crossesSegment = do
            guard segmented
            Just $ \from to ->
-              let from' = (from + 1) * chunk - 1
-                  to' = (to + 1) * chunk - 1
+              let from' = (from + 1) * chunk32 - 1
+                  to' = (to + 1) * chunk32 - 1
               in (to' - from') .>. (to' + segsize_compact - boundary) `mod` segsize_compact

      scan_op1 <- renameLambda $ segBinOpLambda scan_op
@@ -606,7 +612,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
          dPrimV_ y' $ tvExp acc

        sIf
-          (ltid32 * chunk .<. boundary .&&. bNot blockNewSgm)
+          (ltid32 * chunk32 .<. boundary .&&. bNot blockNewSgm)
          ( compileStms mempty (bodyStms $ lambdaBody scan_op4) $
              forM_ (zip3 xs tys $ map resSubExp $ bodyResult $ lambdaBody scan_op4) $
                \(x, ty, res) -> x <~~ toExp' ty res
@@ -616,7 +622,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
        -- elements left before new segment.
        stop <-
          dPrimVE "stopping_point" $
-            segsize_compact - (ltid32 * chunk - 1 + segsize_compact - boundary) `rem` segsize_compact
+            segsize_compact - (ltid32 * chunk32 - 1 + segsize_compact - boundary) `rem` segsize_compact
        sFor "i" chunk $ \i -> do
          sWhen (sExt32 i .<. stop - 1) $ do
            forM_ (zip private_chunks ys) $ \(src, y) ->
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs b/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs
index 5675063a16..df5f160bda 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs
@@ -333,7 +333,7 @@ ensureDeviceFuns code = do

isConst :: GroupDim -> Maybe T.Text
isConst (Left (ValueExp (IntValue x))) =
  Just $ prettyText $ intToInt64 x
-isConst (Right (SizeConst v)) =
+isConst (Right (SizeConst v _)) =
  Just $ zEncodeText $ nameToText v
isConst (Right (SizeMaxConst size_class)) =
  Just $ "max_" <> prettyText size_class
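Taken together, the flow that both SegRed and the single-pass SegScan now
follow is: build a symbolic chunk with getChunkSize, bind it to a host-side
variable via kernelConstToExp, and hand the same expression to the kernel
through kAttrConstExps so that it is a known constant when the kernel itself
is compiled. A compressed, illustrative fragment (names as in the diffs
above; assumed to run in CallKernelGen, not part of the patch):

    let chunk_const = getChunkSize param_types
    chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
    let attrs =
          (defKernelAttrs num_groups group_size)
            { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
            }
    sKernelThread "example" (segFlat space) attrs $
      pure () -- kernel body; inside it, tvExp chunk_v acts as a compile-time constant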