From 5c6af293d6461972a28bb46cd2c13edc9248a7d1 Mon Sep 17 00:00:00 2001
From: Troels Henriksen
Date: Wed, 6 Dec 2023 23:05:38 +0100
Subject: [PATCH] Query register capacity for segred and segscan codegen.

This is very tedious code, and required adding the notion of "kernel
constant expressions", as we have some expressions that _must_ be
constant at kernel compilation time (which is at program runtime). We
actually had this notion in the ImpCode representation, but now ImpGen
provides some manual control as well.
---
 src/Futhark/CodeGen/Backends/GPU.hs           |  2 +-
 src/Futhark/CodeGen/Backends/PyOpenCL.hs      |  2 +-
 .../CodeGen/Backends/PyOpenCL/Boilerplate.hs  |  2 +-
 src/Futhark/CodeGen/ImpCode/GPU.hs            | 10 ++-
 src/Futhark/CodeGen/ImpGen/GPU/Base.hs        | 77 ++++++++++++-------
 src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs     |  3 +-
 src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs      | 67 +++++++++-------
 .../CodeGen/ImpGen/GPU/SegScan/SinglePass.hs  | 32 ++++----
 src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs    |  2 +-
 9 files changed, 121 insertions(+), 76 deletions(-)

diff --git a/src/Futhark/CodeGen/Backends/GPU.hs b/src/Futhark/CodeGen/Backends/GPU.hs
index 9dd1859e2f..a2afba4321 100644
--- a/src/Futhark/CodeGen/Backends/GPU.hs
+++ b/src/Futhark/CodeGen/Backends/GPU.hs
@@ -71,7 +71,7 @@ getParamByKey :: Name -> C.Exp
getParamByKey key = [C.cexp|*ctx->tuning_params.$id:key|]

kernelConstToExp :: KernelConst -> C.Exp
-kernelConstToExp (SizeConst key) =
+kernelConstToExp (SizeConst key _) =
  getParamByKey key
kernelConstToExp (SizeMaxConst size_class) =
  [C.cexp|ctx->$id:field|]
diff --git a/src/Futhark/CodeGen/Backends/PyOpenCL.hs b/src/Futhark/CodeGen/Backends/PyOpenCL.hs
index 716aaeb355..3d2d6a85e1 100644
--- a/src/Futhark/CodeGen/Backends/PyOpenCL.hs
+++ b/src/Futhark/CodeGen/Backends/PyOpenCL.hs
@@ -207,7 +207,7 @@ getParamByKey :: Name -> PyExp
getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)

kernelConstToExp :: Imp.KernelConst -> PyExp
-kernelConstToExp (Imp.SizeConst key) =
+kernelConstToExp (Imp.SizeConst key _) =
  getParamByKey key
kernelConstToExp (Imp.SizeMaxConst size_class) =
  Var $ "self.max_" <> prettyString size_class
diff --git a/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs b/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
index 65e6f65e9c..aec8fda35c 100644
--- a/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
+++ b/src/Futhark/CodeGen/Backends/PyOpenCL/Boilerplate.hs
@@ -34,7 +34,7 @@ getParamByKey :: Name -> PyExp
getParamByKey key = Index (Var "self.sizes") (IdxExp $ String $ prettyText key)

kernelConstToExp :: KernelConst -> PyExp
-kernelConstToExp (SizeConst key) =
+kernelConstToExp (SizeConst key _) =
  getParamByKey key
kernelConstToExp (SizeMaxConst size_class) =
  Var $ "self.max_" <> prettyString size_class
diff --git a/src/Futhark/CodeGen/ImpCode/GPU.hs b/src/Futhark/CodeGen/ImpCode/GPU.hs
index 6241e17de5..d526c9e2de 100644
--- a/src/Futhark/CodeGen/ImpCode/GPU.hs
+++ b/src/Futhark/CodeGen/ImpCode/GPU.hs
@@ -34,7 +34,7 @@ type KernelCode = Code KernelOp

-- | A run-time constant related to kernels.
data KernelConst
-  = SizeConst Name
+  = SizeConst Name SizeClass
  | SizeMaxConst SizeClass
  deriving (Eq, Ord, Show)

@@ -85,11 +85,13 @@ data KernelUse
  deriving (Eq, Ord, Show)

instance Pretty KernelConst where
-  pretty (SizeConst key) = "get_size" <> parens (pretty key)
-  pretty (SizeMaxConst size_class) = "get_max_size" <> parens (pretty size_class)
+  pretty (SizeConst key size_class) =
+    "get_size" <> parens (commasep [pretty key, pretty size_class])
+  pretty (SizeMaxConst size_class) =
+    "get_max_size" <> parens (pretty size_class)

instance FreeIn KernelConst where
-  freeIn' (SizeConst _) = mempty
+  freeIn' SizeConst {} = mempty
  freeIn' (SizeMaxConst _) = mempty

instance Pretty KernelUse where
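A `SizeConst` leaf now carries the `SizeClass` it was derived from, so later
stages can reconstruct the corresponding `GetSize` query. Kernel constants
compose into whole expressions; as a rough illustration (not part of the
patch, and assuming the usual `KernelConstExp = PrimExp KernelConst` synonym
together with the `le64`/`untyped` combinators used in `Base.hs` below), a
per-work-item register budget could be written as:

    -- Illustrative only: registers available per work-item, as an
    -- expression that is constant at kernel compilation time.
    regsPerThread :: KernelConstExp
    regsPerThread =
      untyped $
        le64 (SizeMaxConst SizeRegisters) `quot` le64 (SizeMaxConst SizeGroup)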
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/Base.hs b/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
index 34b109e2b9..2dc29789d1 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/Base.hs
@@ -30,6 +30,7 @@ module Futhark.CodeGen.ImpGen.GPU.Base
    updateAcc,
    genZeroes,
    isPrimParam,
+    kernelConstToExp,
    getChunkSize,

    -- * Host-level bulk operations
@@ -261,26 +262,40 @@ fenceForArrays = fmap (foldl' max Imp.FenceLocal) . mapM need
isPrimParam :: (Typed p) => Param p -> Bool
isPrimParam = primType . paramType

--- | Given a list of parameter types, compute the largest available chunk size
--- given the parameters for which we want chunking and the available resources.
--- Used in SegScan.SinglePass.compileSegScan, and SegRed.compileSegRed (with
--- primitive non-commutative operators only).
-getChunkSize :: (Num a) => [Type] -> a
-getChunkSize types = fromInteger $ max 1 $ min mem_constraint reg_constraint
+kernelConstToExp :: Imp.KernelConstExp -> CallKernelGen Imp.Exp
+kernelConstToExp = traverse f
  where
-    types' = map elemType $ filter primType types
-    sizes = map primByteSize types'
-
-    sum_sizes = sum sizes
-    sum_sizes' = sum (map (max 4 . primByteSize) types') `div` 4
-    max_size = maximum sizes
-
-    mem_constraint = max k_mem sum_sizes `div` max_size
-    reg_constraint = (k_reg - 1 - sum_sizes') `div` (2 * sum_sizes')
-
-    -- TODO: Make these constants dynamic by querying device
-    k_reg = 64
-    k_mem = 95
+    f (Imp.SizeMaxConst c) = do
+      v <- dPrim (prettyString c) int64
+      sOp $ Imp.GetSizeMax (tvVar v) c
+      pure $ tvVar v
+    f (Imp.SizeConst k c) = do
+      v <- dPrim (nameToString k) int64
+      sOp $ Imp.GetSize (tvVar v) k c
+      pure $ tvVar v
+
+-- | Given the available registers and cache, and a list of parameter
+-- types, compute the largest available chunk size given the parameters
+-- for which we want chunking and the available resources. Used in
+-- 'SegScan.SinglePass.compileSegScan', and 'SegRed.compileSegRed'
+-- (with primitive non-commutative operators only).
+getChunkSize :: [Type] -> Imp.KernelConstExp
+getChunkSize types = do
+  let max_group_size = Imp.SizeMaxConst SizeGroup
+      max_group_mem = Imp.SizeMaxConst SizeLocalMemory
+      max_group_reg = Imp.SizeMaxConst SizeRegisters
+      k_mem = le64 max_group_mem `quot` le64 max_group_size
+      k_reg = le64 max_group_reg `quot` le64 max_group_size
+      types' = map elemType $ filter primType types
+      sizes = map primByteSize types'
+
+      sum_sizes = sum sizes
+      sum_sizes' = sum (map (sMax64 4 . primByteSize) types') `quot` 4
+      max_size = maximum sizes
+
+      mem_constraint = max k_mem sum_sizes `quot` max_size
+      reg_constraint = (k_reg - 1 - sum_sizes') `quot` (2 * sum_sizes')
+  untyped $ sMax64 1 $ sMin64 mem_constraint reg_constraint

inBlockScan ::
  KernelConstants ->
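The constraint arithmetic is unchanged from the old host-side version; what
changes is that the per-thread register and local-memory budgets are now
obtained by dividing the queried per-group limits by the maximal group size,
rather than being hard-coded. A standalone sketch of the same formula, using
the constants the removed code used (64 registers and 95 bytes of fast memory
per thread) and taking the per-element sizes in bytes of a non-empty
parameter list:

    -- Illustrative only; mirrors the formula above with fixed budgets.
    chunkSketch :: [Integer] -> Integer
    chunkSketch sizes = max 1 (min mem_constraint reg_constraint)
      where
        k_reg = 64 :: Integer -- registers per thread (old hard-coded value)
        k_mem = 95 :: Integer -- bytes of fast memory per thread (old value)
        sum_sizes = sum sizes
        sum_sizes' = sum (map (max 4) sizes) `quot` 4
        max_size = maximum sizes
        mem_constraint = max k_mem sum_sizes `quot` max_size
        reg_constraint = (k_reg - 1 - sum_sizes') `quot` (2 * sum_sizes')

For a single f64 operand, chunkSketch [8] gives max 1 (min (95 `quot` 8)
(61 `quot` 4)) = min 11 15 = 11 sequential elements per thread.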
@@ -920,10 +935,10 @@ isConstExp vtable size = do
  let onLeaf name _ = lookupConstExp name
      lookupConstExp name = constExp =<< hasExp =<< M.lookup name vtable
-      constExp (Op (Inner (SizeOp (GetSize key _)))) =
-        Just $ LeafExp (Imp.SizeConst $ keyWithEntryPoint fname key) int32
-      constExp (Op (Inner (SizeOp (GetSizeMax size_class)))) =
-        Just $ LeafExp (Imp.SizeMaxConst size_class) int32
+      constExp (Op (Inner (SizeOp (GetSize key c)))) =
+        Just $ LeafExp (Imp.SizeConst (keyWithEntryPoint fname key) c) int32
+      constExp (Op (Inner (SizeOp (GetSizeMax c)))) =
+        Just $ LeafExp (Imp.SizeMaxConst c) int32
      constExp e = primExpFromExp lookupConstExp e
  pure $ replaceInPrimExpM onLeaf size
  where
@@ -1112,7 +1127,12 @@ data KernelAttrs = KernelAttrs
    -- | Number of groups.
    kAttrNumGroups :: Count NumGroups SubExp,
    -- | Group size.
-    kAttrGroupSize :: Count GroupSize SubExp
+    kAttrGroupSize :: Count GroupSize SubExp,
+    -- | Variables that are specially in scope inside the kernel.
+    -- Operationally, these will be available at kernel compile time
+    -- (which happens at run-time, with access to machine-specific
+    -- information).
+    kAttrConstExps :: M.Map VName Imp.KernelConstExp
  }

-- | The default kernel attributes.
@@ -1125,7 +1145,8 @@ defKernelAttrs num_groups group_size =
    { kAttrFailureTolerant = False,
      kAttrCheckLocalMemory = True,
      kAttrNumGroups = num_groups,
-      kAttrGroupSize = group_size
+      kAttrGroupSize = group_size,
+      kAttrConstExps = mempty
    }

getSize :: String -> SizeClass -> CallKernelGen (TV Int64)
@@ -1190,12 +1211,12 @@ sKernelOp ::
sKernelOp attrs constants ops name m = do
  HostEnv atomics _ locks <- askEnv
  body <- makeAllMemoryGlobal $ subImpM_ (KernelEnv atomics constants locks) ops m
-  uses <- computeKernelUses body mempty
+  uses <- computeKernelUses body $ M.keys $ kAttrConstExps attrs
  group_size <- onGroupSize $ kernelGroupSize constants
  emit . Imp.Op . Imp.CallKernel $
    Imp.Kernel
      { Imp.kernelBody = body,
-        Imp.kernelUses = uses,
+        Imp.kernelUses = uses <> map constToUse (M.toList (kAttrConstExps attrs)),
        Imp.kernelNumGroups = [untyped $ kernelNumGroups constants],
        Imp.kernelGroupSize = [group_size],
        Imp.kernelName = name,
@@ -1213,6 +1234,8 @@ sKernelOp attrs constants ops name m = do
        Just (LeafExp kc _) -> Right kc
        _ -> Left $ untyped e

+    constToUse (v, e) = Imp.ConstUse v e
+
sKernelFailureTolerant ::
  Bool ->
  Operations GPUMem KernelEnv Imp.KernelOp ->
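With `kAttrConstExps`, a caller binds a fresh name to a kernel constant
expression; `sKernelOp` then excludes those names from the ordinary scalar
uses and passes them as `ConstUse` instead, so their values are only fixed
when the kernel itself is compiled at program run time. Independently of any
kernel, the new `kernelConstToExp` also lets host code materialise such an
expression directly. A small illustrative fragment (not in the patch,
assumed to run in the `CallKernelGen` monad with `Base.hs`'s imports):

    -- Illustrative only: query a device limit and print it at run time.
    max_regs <- kernelConstToExp $ untyped $ le64 (Imp.SizeMaxConst SizeRegisters)
    emit $ Imp.DebugPrint "register budget" $ Just max_regs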
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
index 267ff24410..578d569320 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/SegHist.hs
@@ -1065,7 +1065,8 @@ compileSegHist ::
  KernelBody GPUMem ->
  CallKernelGen ()
compileSegHist (Pat pes) lvl space ops kbody = do
-  KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
+  KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
+    lvlKernelAttrs lvl
  -- Most of this function is not the histogram part itself, but
  -- rather figuring out whether to use a local or global memory
  -- strategy, as well as collapsing the subhistograms produced (which
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
index 8bbdb3bee2..3d6edbc438 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
@@ -57,6 +57,7 @@ where

import Control.Monad
import Data.List (genericLength, zip4)
+import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
@@ -106,7 +107,8 @@ compileSegRed ::
  CallKernelGen ()
compileSegRed pat lvl space segbinops map_kbody = do
  emit $ Imp.DebugPrint "\n# SegRed" Nothing
-  KernelAttrs _ _ num_groups group_size <- lvlKernelAttrs lvl
+  KernelAttrs {kAttrNumGroups = num_groups, kAttrGroupSize = group_size} <-
+    lvlKernelAttrs lvl
  let grid = KernelGrid num_groups group_size

  compileSegRed' pat grid space segbinops $ \red_cont ->
@@ -142,31 +144,32 @@ compileSegRed' pat grid space segbinops map_body_cont
  | genericLength segbinops > maxNumOps =
      compilerLimitationS $
        "compileSegRed': at most " ++ show maxNumOps ++ " reduction operators are supported."
-  | [(_, Constant (IntValue (Int64Value 1))), _] <- unSegSpace space =
-      compileReduction nonsegmentedReduction
  | otherwise = do
-      let segment_size = pe64 $ last $ segSpaceDims space
-          use_small_segments = segment_size * 2 .<. group_size_E * chunk_E
-      sIf
-        use_small_segments
-        (compileReduction smallSegmentsReduction)
-        (compileReduction largeSegmentsReduction)
+      chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
+      case unSegSpace space of
+        [(_, Constant (IntValue (Int64Value 1))), _] ->
+          compileReduction (chunk_v, chunk_const) nonsegmentedReduction
+        _ -> do
+          let segment_size = pe64 $ last $ segSpaceDims space
+              use_small_segments = segment_size * 2 .<. pe64 (unCount group_size) * tvExp chunk_v
+          sIf
+            use_small_segments
+            (compileReduction (chunk_v, chunk_const) smallSegmentsReduction)
+            (compileReduction (chunk_v, chunk_const) largeSegmentsReduction)
  where
-    compileReduction f =
+    compileReduction chunk f =
      f pat num_groups group_size chunk space segbinops map_body_cont

-    chunk
-      | Noncommutative <- mconcat (map segBinOpComm segbinops),
-        all isPrimSegBinOp segbinops =
-          intConst Int64 $ getChunkSize param_types
-      | otherwise = intConst Int64 1
-
    param_types = map paramType $ concatMap paramOf segbinops

    num_groups = gridNumGroups grid
    group_size = gridGroupSize grid
-    group_size_E = pe64 $ unCount group_size
-    chunk_E = pe64 chunk
+
+    chunk_const =
+      if Noncommutative `elem` map segBinOpComm segbinops
+        && all isPrimSegBinOp segbinops
+        then getChunkSize param_types
+        else Imp.ValueExp $ IntValue $ intValue Int64 (1 :: Int64)

-- | Prepare intermediate arrays for the reduction. Prim-typed
-- arguments go in local memory (so we need to do the allocation of
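The small-versus-large segment decision now uses the run-time chunk: a
segmented reduction takes the small-segments path when
segment_size * 2 < group_size * chunk. As a purely illustrative data point,
with a group size of 256 and the chunk of 11 from the sketch further above,
the threshold product is 2816, so segments of fewer than 1408 elements would
go down the small-segments path.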
@@ -290,16 +293,16 @@ type DoCompileSegRed =
  Pat LetDecMem ->
  Count NumGroups SubExp ->
  Count GroupSize SubExp ->
-  SubExp ->
+  (TV Int64, Imp.KernelConstExp) ->
  SegSpace ->
  [SegBinOp GPUMem] ->
  DoSegBody ->
  CallKernelGen ()

nonsegmentedReduction :: DoCompileSegRed
-nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
+nonsegmentedReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
-      chunk = pe64 chunk_se
+      chunk = tvExp chunk_v
      num_groups_se = unCount num_groups
      group_size_se = unCount group_size
      group_size' = pe64 group_size_se
@@ -313,12 +316,17 @@ nonsegmentedReduction (Pat segred_pes) num_groups group_size chunk_se space segb
  num_threads <-
    fmap tvSize $ dPrimV "num_threads" $ pe64 num_groups_se * group_size'

-  sKernelThread "segred_nonseg" (segFlat space) (defKernelAttrs num_groups group_size) $ do
+  let attrs =
+        (defKernelAttrs num_groups group_size)
+          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
+          }
+
+  sKernelThread "segred_nonseg" (segFlat space) attrs $ do
    constants <- kernelConstants <$> askEnv
    let ltid = kernelLocalThreadId constants
    let group_id = kernelGroupId constants

-    interms <- makeIntermArrays (sExt64 group_id) group_size_se chunk_se segbinops
+    interms <- makeIntermArrays (sExt64 group_id) group_size_se (tvSize chunk_v) segbinops
    sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

    -- Since this is the nonsegmented case, all outer segment IDs must
@@ -473,7 +481,7 @@ smallSegmentsReduction (Pat segred_pes) num_groups group_size _ space segbinops
    sOp $ Imp.Barrier Imp.FenceLocal

largeSegmentsReduction :: DoCompileSegRed
-largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space segbinops map_body_cont = do
+largeSegmentsReduction (Pat segred_pes) num_groups group_size (chunk_v, chunk_const) space segbinops map_body_cont = do
  let (gtids, dims) = unzip $ unSegSpace space
      dims' = map pe64 dims
      num_segments = product $ init dims'
      num_groups' = pe64 $ unCount num_groups
      group_size_se = unCount group_size
      group_size' = pe64 group_size_se
-      chunk = pe64 chunk_se
+      chunk = tvExp chunk_v

  groups_per_segment <-
    dPrimVE "groups_per_segment" $
@@ -522,12 +530,17 @@ largeSegmentsReduction (Pat segred_pes) num_groups group_size chunk_se space seg
  let num_counters = maxNumOps * 1024
  counters <- genZeroes "counters" $ fromIntegral num_counters

-  sKernelThread "segred_large" (segFlat space) (defKernelAttrs num_groups group_size) $ do
+  let attrs =
+        (defKernelAttrs num_groups group_size)
+          { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
+          }
+
+  sKernelThread "segred_large" (segFlat space) attrs $ do
    constants <- kernelConstants <$> askEnv
    let group_id = sExt64 $ kernelGroupId constants
        ltid = kernelLocalThreadId constants

-    interms <- makeIntermArrays group_id group_size_se chunk_se segbinops
+    interms <- makeIntermArrays group_id group_size_se (tvSize chunk_v) segbinops
    sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

    -- We probably do not have enough actual workgroups to cover the
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs b/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs
index accbecdc40..0adedc7f6a 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/SegScan/SinglePass.hs
@@ -7,6 +7,7 @@ module Futhark.CodeGen.ImpGen.GPU.SegScan.SinglePass (compileSegScan) where

import Control.Monad
import Data.List (zip4, zip7)
+import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.GPU qualified as Imp
import Futhark.CodeGen.ImpGen
@@ -229,14 +230,15 @@ compileSegScan pat lvl space scan_op map_kbody = do

      tys' = lambdaReturnType $ segBinOpLambda scan_op

-      chunk :: (Num a) => a
-      chunk = getChunkSize tys'
-
      tys = map elemType tys'

      group_size_e = pe64 $ unCount $ kAttrGroupSize attrs
      num_physgroups_e = pe64 $ unCount $ kAttrNumGroups attrs

+  let chunk_const = getChunkSize tys'
+  chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
+  let chunk = tvExp chunk_v
+
  num_virtgroups <-
    tvSize <$> dPrimV "num_virtgroups" (n `divUp` (group_size_e * chunk))
  let num_virtgroups_e = pe64 num_virtgroups
@@ -250,7 +252,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
      not_segmented_e = fromBool $ not segmented
      segment_size = last dims'

-  emit $ Imp.DebugPrint "Sequential elements per thread (chunk)" $ Just $ untyped (chunk :: Imp.TExp Int32)
+  emit $ Imp.DebugPrint "Sequential elements per thread (chunk)" $ Just $ untyped chunk

  statusFlags <- sAllocArray "status_flags" int8 (Shape [num_virtgroups]) (Space "device")
  sReplicate statusFlags $ intConst Int8 statusX
@@ -264,14 +266,18 @@ compileSegScan pat lvl space scan_op map_kbody = do

  global_id <- genZeroes "global_dynid" 1

-  sKernelThread "segscan" (segFlat space) attrs $ do
+  let attrs' = attrs {kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const}
+
+  sKernelThread "segscan" (segFlat space) attrs' $ do
+    chunk32 <- dPrimVE "chunk_size_32b" $ sExt32 $ tvExp chunk_v
+
    constants <- kernelConstants <$> askEnv
    let ltid32 = kernelLocalThreadId constants
        ltid = sExt64 ltid32

    (sharedId, transposedArrays, prefixArrays, warpscan, exchanges) <-
-      createLocalArrays (kAttrGroupSize attrs) (intConst Int64 chunk) tys
+      createLocalArrays (kAttrGroupSize attrs) (tvSize chunk_v) tys

    -- We wrap the entire kernel body in a virtualisation loop to handle the
    -- case where we do not have enough workgroups to cover the iteration space.
@@ -333,8 +339,8 @@ compileSegScan pat lvl space scan_op map_kbody = do
          sAllocArray
            "private"
            ty
-            (Shape [intConst Int64 chunk])
-            (ScalarSpace [intConst Int64 chunk] ty)
+            (Shape [tvSize chunk_v])
+            (ScalarSpace [tvSize chunk_v] ty)

        thd_offset <- dPrimVE "thd_offset" $ block_offset + ltid
@@ -385,7 +391,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
        new_sgm <-
          if segmented
            then do
-              gidx <- dPrimVE "gidx" $ (ltid32 * chunk) + 1
+              gidx <- dPrimVE "gidx" $ (ltid32 * chunk32) + 1
              dPrimVE "new_sgm" $ (gidx + sExt32 i - boundary) `mod` segsize_compact .==. 0
            else pure false
        -- skip scan of first element in segment
@@ -408,8 +414,8 @@ compileSegScan pat lvl space scan_op map_kbody = do
      let crossesSegment = do
            guard segmented
            Just $ \from to ->
-              let from' = (from + 1) * chunk - 1
-                  to' = (to + 1) * chunk - 1
+              let from' = (from + 1) * chunk32 - 1
+                  to' = (to + 1) * chunk32 - 1
               in (to' - from') .>. (to' + segsize_compact - boundary) `mod` segsize_compact

      scan_op1 <- renameLambda $ segBinOpLambda scan_op
@@ -606,7 +612,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
          dPrimV_ y' $ tvExp acc

        sIf
-          (ltid32 * chunk .<. boundary .&&. bNot blockNewSgm)
+          (ltid32 * chunk32 .<. boundary .&&. bNot blockNewSgm)
          ( compileStms mempty (bodyStms $ lambdaBody scan_op4) $
              forM_ (zip3 xs tys $ map resSubExp $ bodyResult $ lambdaBody scan_op4) $
                \(x, ty, res) -> x <~~ toExp' ty res
@@ -616,7 +622,7 @@ compileSegScan pat lvl space scan_op map_kbody = do
        -- elements left before new segment.
        stop <-
          dPrimVE "stopping_point" $
-            segsize_compact - (ltid32 * chunk - 1 + segsize_compact - boundary) `rem` segsize_compact
+            segsize_compact - (ltid32 * chunk32 - 1 + segsize_compact - boundary) `rem` segsize_compact
        sFor "i" chunk $ \i -> do
          sWhen (sExt32 i .<. stop - 1) $ do
            forM_ (zip private_chunks ys) $ \(src, y) ->
diff --git a/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs b/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs
index 5675063a16..df5f160bda 100644
--- a/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs
+++ b/src/Futhark/CodeGen/ImpGen/GPU/ToOpenCL.hs
@@ -333,7 +333,7 @@ ensureDeviceFuns code = do

isConst :: GroupDim -> Maybe T.Text
isConst (Left (ValueExp (IntValue x))) =
  Just $ prettyText $ intToInt64 x
-isConst (Right (SizeConst v)) =
+isConst (Right (SizeConst v _)) =
  Just $ zEncodeText $ nameToText v
isConst (Right (SizeMaxConst size_class)) =
  Just $ "max_" <> prettyText size_class
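Taken together, the flow that both SegRed and the single-pass SegScan now
follow is: build a symbolic chunk with getChunkSize, bind it to a host-side
variable via kernelConstToExp, and hand the same expression to the kernel
through kAttrConstExps so that it is a known constant when the kernel itself
is compiled. A compressed, illustrative fragment (names as in the diffs
above; assumed to run in CallKernelGen, not part of the patch):

    let chunk_const = getChunkSize param_types
    chunk_v <- dPrimV "chunk_size" . isInt64 =<< kernelConstToExp chunk_const
    let attrs =
          (defKernelAttrs num_groups group_size)
            { kAttrConstExps = M.singleton (tvVar chunk_v) chunk_const
            }
    sKernelThread "example" (segFlat space) attrs $
      pure () -- kernel body; inside it, tvExp chunk_v acts as a compile-time constant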