Commit a1aa557
Parallelize by a factor of 1

The CPU threading model is distinct from the GPU thread-group model: GPU shared memory is not shared beyond a single thread group. Whenever nested parallelism is enabled in the Mullapudi2016 auto-scheduler, always implement parallelizable loop dimensions as `gpu_block`. This is done by splitting each such dimension by a factor of 1: `f.split(z, zo, zi, 1)`. Splitting by 1 makes the auto-scheduler's `last_level_cache` estimate per GPU warp more robust against variations in the nested parallelism. In the `*/apps/` folders, remove the manual overrides of `last_level_cache_size` and rely on the default estimate of 47 kB per thread group (stencil_chain keeps an explicit override, raised from 1000 to 2000, to avoid GPU kernel launch failures).
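As a concrete illustration of that lowering, here is a minimal hand-written Halide sketch of the factor-of-1 split, assuming a toy `Func` and illustrative variable names (`zo`, `zi`); it sketches the technique, not the auto-scheduler's generated code:

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y"), z("z");
    f(x, y, z) = x + y + z;

    // Split the parallelizable dimension z by a factor of 1: zo keeps the
    // full extent and zi is degenerate (extent 1).
    Var zo("zo"), zi("zi");
    f.split(z, zo, zi, 1)
        .gpu_blocks(zo)    // the parallel dimension becomes a gpu_block
        .gpu_threads(x);   // innermost dimension runs on gpu_threads

    // Print the pseudo loop nest; no GPU is needed for this.
    f.print_loop_nest();
    return 0;
}
```

Because each thread group then covers exactly one iteration of `zi`, nothing in the schedule assumes shared memory is visible across groups.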
1 parent: e46ac2c

File tree: 6 files changed, +7 −7 lines

apps/bgu/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(bgu_filter filter.cpp)

apps/harris/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(harris_filter filter.cpp)

apps/iir_blur/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(iir_blur FROM iir_blur.generator)
 add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
                    GENERATOR iir_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(iir_blur_filter filter.cpp)

apps/lens_blur/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(lens_blur_filter process.cpp)

apps/stencil_chain/CMakeLists.txt (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS autoscheduler.last_level_cache_size=2000 autoscheduler.experimental_gpu_schedule=1
 )

 # Main executable

src/autoschedulers/mullapudi2016/AutoSchedule.cpp (2 additions, 2 deletions)

@@ -1430,15 +1430,15 @@ class GPUTilingDedup {
             }

             split_info new_entry{entry};
-            new_entry.factor = simplify(min(threads_budget, new_entry.factor));
+            new_entry.factor = 1;

             const bool can_split = helper.try_split(new_entry);
             if (!can_split) {
                 // If more than 3 gpu_blocks are defined, mark the current loop as the for-loop.
                 parallelize.erase(iter);
                 continue;
             }
-            threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
+            threads_budget = simplify(max(threads_budget / entry.factor, 1));
         }

         helper.commit(sched, is_compute_at);
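For intuition about the second hunk: every split now applies a factor of 1, but the thread budget is still charged with the factor the analysis originally requested (`entry.factor`) rather than the factor actually applied (`new_entry.factor`, now always 1). Dividing by `new_entry.factor` would divide by 1 every iteration and never shrink the budget. Below is a standalone sketch of that bookkeeping, with plain integers standing in for Halide `Expr`s; the struct, sample factors, and budget are hypothetical, not the auto-scheduler's real data:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified stand-in for the auto-scheduler's split_info.
struct SplitInfo {
    const char *var;
    int factor;  // tile size requested by the CPU-oriented analysis
};

int main() {
    // Hypothetical parallelizable dimensions with their requested factors.
    std::vector<SplitInfo> parallelize = {{"z", 8}, {"y", 16}, {"x", 32}};
    int threads_budget = 1024;  // e.g. max threads per GPU block

    for (const SplitInfo &entry : parallelize) {
        SplitInfo new_entry = entry;
        new_entry.factor = 1;  // always split by 1 -> loop becomes gpu_block

        // Charge the budget with the *requested* factor (entry.factor),
        // not the applied one (new_entry.factor == 1), mirroring the fix.
        threads_budget = std::max(threads_budget / entry.factor, 1);
        std::printf("%s: applied factor %d, remaining budget %d\n",
                    new_entry.var, new_entry.factor, threads_budget);
    }
    return 0;
}
```

Keeping the budget accounting tied to the requested factors is what keeps the per-warp `last_level_cache` estimate consistent regardless of how the nested parallelism varies.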
