Commit a1aa557
Parallelize by a factor of 1

The CPU threading model is distinct from the GPU thread-group model: GPU shared memory is not shared beyond a single thread group. Whenever nested parallelism is enabled in the Mullapudi2016 auto-scheduler, always implement parallelizable loop dimensions as `gpu_block`. This is done by splitting each such dimension by a factor of 1: `f.split(z, zo, zi, 1)`. Splitting by 1 makes the auto-scheduler's `last_level_cache` estimate per GPU warp more robust against variations in the nested parallelism. In the `*/apps/` folders, remove the manual overrides of `last_level_cache_size` and rely on the default estimate of 47 kB per thread group (stencil_chain keeps an explicit override, raised from 1000 to 2000, to avoid GPU kernel launch failures).
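As a concrete illustration of that lowering, here is a minimal hand-written Halide sketch of the factor-of-1 split, assuming a toy `Func` and illustrative variable names (`zo`, `zi`); it sketches the technique, not the auto-scheduler's generated code:

```cpp
#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y"), z("z");
    f(x, y, z) = x + y + z;

    // Split the parallelizable dimension z by a factor of 1: zo keeps the
    // full extent and zi is degenerate (extent 1).
    Var zo("zo"), zi("zi");
    f.split(z, zo, zi, 1)
        .gpu_blocks(zo)    // the parallel dimension becomes a gpu_block
        .gpu_threads(x);   // innermost dimension runs on gpu_threads

    // Print the pseudo loop nest; no GPU is needed for this.
    f.print_loop_nest();
    return 0;
}
```

Because each thread group then covers exactly one iteration of `zi`, nothing in the schedule assumes shared memory is visible across groups.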
1 parent: e46ac2c

File tree: 6 files changed, +7 −7 lines

apps/bgu/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(bgu_filter filter.cpp)

apps/harris/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(harris_filter filter.cpp)

apps/iir_blur/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(iir_blur FROM iir_blur.generator)
 add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
                    GENERATOR iir_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(iir_blur_filter filter.cpp)

apps/lens_blur/CMakeLists.txt (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(lens_blur_filter process.cpp)

apps/stencil_chain/CMakeLists.txt (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS autoscheduler.last_level_cache_size=2000 autoscheduler.experimental_gpu_schedule=1
 )

 # Main executable

src/autoschedulers/mullapudi2016/AutoSchedule.cpp (2 additions, 2 deletions)

@@ -1430,15 +1430,15 @@ class GPUTilingDedup {
             }

             split_info new_entry{entry};
-            new_entry.factor = simplify(min(threads_budget, new_entry.factor));
+            new_entry.factor = 1;

             const bool can_split = helper.try_split(new_entry);
             if (!can_split) {
                 // If more than 3 gpu_blocks are defined, mark the current loop as the for-loop.
                 parallelize.erase(iter);
                 continue;
             }
-            threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
+            threads_budget = simplify(max(threads_budget / entry.factor, 1));
         }

         helper.commit(sched, is_compute_at);
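For intuition about the second hunk: every split now applies a factor of 1, but the thread budget is still charged with the factor the analysis originally requested (`entry.factor`) rather than the factor actually applied (`new_entry.factor`, now always 1). Dividing by `new_entry.factor` would divide by 1 every iteration and never shrink the budget. Below is a standalone sketch of that bookkeeping, with plain integers standing in for Halide `Expr`s; the struct, sample factors, and budget are hypothetical, not the auto-scheduler's real data:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified stand-in for the auto-scheduler's split_info.
struct SplitInfo {
    const char *var;
    int factor;  // tile size requested by the CPU-oriented analysis
};

int main() {
    // Hypothetical parallelizable dimensions with their requested factors.
    std::vector<SplitInfo> parallelize = {{"z", 8}, {"y", 16}, {"x", 32}};
    int threads_budget = 1024;  // e.g. max threads per GPU block

    for (const SplitInfo &entry : parallelize) {
        SplitInfo new_entry = entry;
        new_entry.factor = 1;  // always split by 1 -> loop becomes gpu_block

        // Charge the budget with the *requested* factor (entry.factor),
        // not the applied one (new_entry.factor == 1), mirroring the fix.
        threads_budget = std::max(threads_budget / entry.factor, 1);
        std::printf("%s: applied factor %d, remaining budget %d\n",
                    new_entry.var, new_entry.factor, threads_budget);
    }
    return 0;
}
```

Keeping the budget accounting tied to the requested factors is what keeps the per-warp `last_level_cache` estimate consistent regardless of how the nested parallelism varies.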
