Skip to content

Commit

Permalink
Fix #2051. (#2052)
Browse files Browse the repository at this point in the history
  • Loading branch information
athas authored Nov 30, 2023
1 parent c98120a commit 5a2e404
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* Incorrect type checking of `let` binding with explicit size
quantification, where size appears in type of body (#2048).

* GPU code generation for non-commutative non-segmented reductions
with array operands (#2051).

## [0.25.9]

### Added
Expand Down
2 changes: 1 addition & 1 deletion src/Futhark/CodeGen/ImpGen/GPU/Base.hs
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,7 @@ groupReduceWithOffset offset w lam arrs = do
| Prim _ <- paramType param =
pure ()
| otherwise =
copyDWIMFix arr [0] (Var $ paramName param) []
copyDWIMFix arr [sExt64 local_tid] (Var $ paramName param) []

let (reduce_acc_params, reduce_arr_params) =
splitAt (length arrs) $ lambdaParams lam
Expand Down
21 changes: 10 additions & 11 deletions src/Futhark/CodeGen/ImpGen/GPU/SegRed.hs
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,19 @@ compileSegRed' pat grid space reds body
-- performed. This policy is baked into how the allocations are done
-- in ExplicitAllocations.
intermediateArrays ::
Imp.TExp Int64 ->
Count GroupSize SubExp ->
SubExp ->
SegBinOp GPUMem ->
InKernelGen [VName]
intermediateArrays (Count group_size) num_threads (SegBinOp _ red_op nes _) = do
intermediateArrays group_id (Count group_size) (SegBinOp _ red_op nes _) = do
let red_op_params = lambdaParams red_op
(red_acc_params, _) = splitAt (length nes) red_op_params
forM red_acc_params $ \p ->
case paramDec p of
MemArray pt shape _ (ArrayIn mem _) -> do
let shape' = Shape [num_threads] <> shape
let shape' = Shape [group_size] <> shape
sArray "red_arr" pt shape' mem $
LMAD.iota 0 (map pe64 $ shapeDims shape')
LMAD.iota (group_id * product (map pe64 (shapeDims shape'))) (map pe64 $ shapeDims shape')
_ -> do
let pt = elemType $ paramType p
shape = Shape [group_size]
Expand Down Expand Up @@ -200,7 +200,8 @@ nonsegmentedReduction segred_pat num_groups group_size space reds body = do
sKernelThread "segred_nonseg" (segFlat space) (defKernelAttrs num_groups group_size) $ do
constants <- kernelConstants <$> askEnv
sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"
reds_arrs <- mapM (intermediateArrays group_size (tvSize num_threads)) reds
reds_arrs <-
mapM (intermediateArrays (sExt64 $ kernelGroupId constants) group_size) reds

-- Since this is the nonsegmented case, all outer segment IDs must
-- necessarily be 0.
Expand Down Expand Up @@ -283,7 +284,8 @@ smallSegmentsReduction (Pat segred_pes) num_groups group_size space reds body =

sKernelThread "segred_small" (segFlat space) (defKernelAttrs num_groups group_size) $ do
constants <- kernelConstants <$> askEnv
reds_arrs <- mapM (intermediateArrays group_size (Var $ tvVar num_threads)) reds
reds_arrs <-
mapM (intermediateArrays (sExt64 $ kernelGroupId constants) group_size) reds

-- We probably do not have enough actual workgroups to cover the
-- entire iteration space. Some groups thus have to perform double
Expand Down Expand Up @@ -392,10 +394,6 @@ largeSegmentsReduction segred_pat num_groups group_size space reds body = do
dPrimV "virt_num_groups" $
groups_per_segment * num_segments

num_threads <-
dPrimV "num_threads" $
unCount num_groups' * unCount group_size'

threads_per_segment <-
dPrimV "threads_per_segment" $
groups_per_segment * unCount group_size'
Expand Down Expand Up @@ -426,7 +424,8 @@ largeSegmentsReduction segred_pat num_groups group_size space reds body = do

sKernelThread "segred_large" (segFlat space) (defKernelAttrs num_groups group_size) $ do
constants <- kernelConstants <$> askEnv
reds_arrs <- mapM (intermediateArrays group_size (tvSize num_threads)) reds
reds_arrs <-
mapM (intermediateArrays (sExt64 $ kernelGroupId constants) group_size) reds
sync_arr <- sAllocArray "sync_arr" Bool (Shape [intConst Int32 1]) $ Space "local"

-- We probably do not have enough actual workgroups to cover the
Expand Down
12 changes: 12 additions & 0 deletions tests/soacs/reduce9.fut
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Reduction whose operator takes and returns whole arrays (rows of the
-- input) rather than scalars.  Added alongside the fix for issue #2051.
-- ==
-- compiled random input { [512][32]i32 } auto output
-- compiled random input { [1024][1024]i32 } auto output

def main [n][m] (xss: [n][m]i32): []i32 =
-- Combine the n rows of xss into a single row with the operator below.
reduce (\(xs: []i32) ys ->
-- Elementwise sum of xs and ys, written as a loop of in-place updates
-- on a zero-filled array of length m.
loop zs = replicate m 0 for i < m do
let zs[i] = xs[i] + ys[i]
in zs
)
-- Neutral element: a row of m zeros.
(replicate m 0)
xss

0 comments on commit 5a2e404

Please sign in to comment.