workshop exercises and slides
milthorpe committed Jul 2, 2024
1 parent 79d1e1c commit 6f0ba1c
Showing 16 changed files with 681 additions and 1 deletion.
Binary file added DeviceAgnosticChapel.pdf
Binary file not shown.
23 changes: 23 additions & 0 deletions Makefile
@@ -0,0 +1,23 @@

ifdef DEBUG
CHPL_FLAGS=-g
else
CHPL_FLAGS=--fast
endif

pi_cpu: pi.chpl
	CHPL_LOCALE_MODEL=flat chpl $(CHPL_FLAGS) $< -o $@

pi_gpu: pi.chpl
	CHPL_LOCALE_MODEL=gpu CHPL_GPU=nvidia chpl $(CHPL_FLAGS) $< -o $@

heat_cpu: heat.chpl
	CHPL_LOCALE_MODEL=flat chpl $(CHPL_FLAGS) $< -o $@

heat_gpu: heat.chpl
	CHPL_LOCALE_MODEL=gpu CHPL_GPU=nvidia chpl $(CHPL_FLAGS) $< -o $@

.PHONY: clean

clean:
	rm -f pi_cpu pi_gpu heat_cpu heat_gpu
106 changes: 105 additions & 1 deletion README.md
@@ -1,3 +1,107 @@
# Device-Agnostic Programming with Chapel: Exercises

This is a placeholder for the exercise repository for the Device Agnostic Programming with Chapel workshop, 3 July 2024. Please check back on 3 July for the exercises.
## Getting Started

We will use NCI's Gadi system for all exercises, running on both the Intel Cascade Lake `normal` nodes and the GPU-accelerated `gpuvolta` nodes.

To set up your environment for Chapel development, run the following command:

```
source /scratch/vp91/chapel-2.1/setup.bash
```

If you use Visual Studio Code as your editor, you may wish to install the [Chapel Language Extension for VS Code](https://marketplace.visualstudio.com/items?itemName=chpl-hpe.chapel-vscode).


## Numerical Computation of Pi

The file [pi.chpl](pi.chpl) contains a sequential code that numerically computes the integral of $`4/(1+x*x)`$ over the interval $`[0..1)`$, which should equal $\pi$. Review the code so you understand what it is doing. Note the line that controls the number of integration steps:

```chapel
config const num_steps = 100000000;
```

Build a CPU-only executable using `make pi_cpu`. Run the executable on the login nodes with small numbers of steps to see how increasing the number of steps improves the accuracy of integration, e.g.

```
./pi_cpu -snum_steps 4
```

### Parallel Pi

As provided, the program computes $\pi$ in a sequential `for` loop. Modify the code so that it uses Chapel's features for [data-parallelism](https://chapel-lang.org/docs/language/spec/data-parallelism.html) to compute the integral.
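For reference, a data-parallel reduction over the integration steps can be expressed with a bracketed `forall` expression and `+ reduce`. The sketch below assumes the midpoint rule and the `num_steps` config constant from `pi.chpl`; the other variable names are illustrative and need not match the provided code.

```chapel
const step = 1.0 / num_steps;
// Evaluate the integrand at the midpoint of each interval and combine
// the partial sums in parallel with a + reduction.
const sum = + reduce [i in 0..#num_steps] 4.0 / (1.0 + ((i + 0.5) * step)**2);
const piApprox = step * sum;
writeln("pi is approximately ", piApprox);
```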

On the login nodes, you can test your changes using small numbers of threads by changing the number of [worker threads](https://chapel-lang.org/docs/usingchapel/tasks.html#controlling-the-number-of-threads) that the Chapel runtime creates. For example:

```
CHPL_RT_NUM_THREADS_PER_LOCALE=4 ./pi_cpu
```

Now run the CPU-only version on a Gadi Cascade Lake compute node using the provided jobscript:

```
qsub job_pi_cpu.sh
```

### GPU-Accelerated Pi

The [`CHPL_LOCALE_MODEL` environment variable](https://chapel-lang.org/docs/usingchapel/chplenv.html#readme-chplenv-chpl-locale-model) determines whether to compile for GPU, or CPU only. You can check the value of this environment variable in Chapel code using the [`ChplConfig` module](https://chapel-lang.org/docs/modules/standard/ChplConfig.html). For example, the following code sets the value of the `targetLoc` locale to be the first GPU sub-locale if compiling for GPUs; otherwise, it sets the value of `targetLoc` to be the current (CPU) locale.

```chapel
use ChplConfig;
const targetLoc = if CHPL_LOCALE_MODEL == "gpu" then here.gpus[0] else here;
```

Modify `pi.chpl` so that it works on either CPU or GPU, depending on how it is compiled.
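One possible structure (a sketch only, not the required solution) wraps the data-parallel loop in an `on` statement targeting `targetLoc`; exactly which loop forms are GPU-eligible depends on your Chapel version, so check the GPU technote if the loop is not offloaded.

```chapel
use ChplConfig;

config const num_steps = 100000000;
const targetLoc = if CHPL_LOCALE_MODEL == "gpu" then here.gpus[0] else here;

var piApprox: real;
on targetLoc {
  const step = 1.0 / num_steps;
  var sum = 0.0;
  // With CHPL_LOCALE_MODEL=gpu this forall is a candidate for kernel
  // execution; with the flat locale model it runs across the CPU cores.
  forall i in 0..#num_steps with (+ reduce sum) do
    sum += 4.0 / (1.0 + ((i + 0.5) * step)**2);
  piApprox = step * sum;
}
writeln("pi is approximately ", piApprox);
```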

Build the GPU version using `make pi_gpu`. What happens if you run it on the (CPU-only) login node?

Run the GPU version on a Gadi GPU Volta compute node using the provided jobscript:

```
qsub job_pi_gpu.sh
```

### Diagnostics and Profiling

You may wonder: how does the Chapel code translate into kernel launches and data movement? Chapel provides a variety of [diagnostic utilities](https://chapel-lang.org/docs/technotes/gpu.html#diagnostics-and-utilities) to help count and trace kernel launches, data movement, and memory allocations. Try adding these diagnostics to `pi.chpl`.
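For example, the `GpuDiagnostics` module can count kernel launches and host-device transfers around a region of code; a minimal sketch:

```chapel
use GpuDiagnostics;

startGpuDiagnostics();
// ... the region of pi.chpl you want to measure ...
stopGpuDiagnostics();
// Reports per-locale counts of kernel launches and host<->device copies.
writeln(getGpuDiagnostics());
```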

How does performance compare with the CPU version? What factors might be contributing to the relative performance of each version? You may wish to conduct [GPU profiling using `nvprof` or Visual Profiler](https://docs.nvidia.com/cuda/profiler-users-guide/index.html) to better understand the performance of the GPU code.
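For a first look, running the GPU executable under `nvprof` (available once the CUDA module is loaded, as in the provided jobscripts) reports the time spent in each kernel and in memory copies:

```
nvprof ./pi_gpu
```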

## Heat Equation Solver

The file [heat.chpl](heat.chpl) contains a sequential code that numerically solves the 2D heat equation using an explicit finite difference discretization.

### Parallel Heat

Modify `heat.chpl` to parallelize the solver as much as possible, making sure that correctness (as measured by `Error (L2norm)`) is maintained.
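As a starting point, the stencil loop in `solve` is a natural candidate for a `forall`, since each `(i,j)` update within a timestep is independent. The sketch below changes only the loop keyword; the loop body is as in `heat.chpl`.

```chapel
// Data-parallel update of the 5-point stencil over the whole grid.
forall (i,j) in outerDom {
  u_tmp[i,j] = r2 * u[i,j] +
               r * (if i < n-1 then u[i+1,j] else 0.0) +
               r * (if i > 0 then u[i-1,j] else 0.0) +
               r * (if j < n-1 then u[i,j+1] else 0.0) +
               r * (if j > 0 then u[i,j-1] else 0.0);
}
```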

Once you are happy with your parallel solver, consider also parallelizing the initialization and solution check code.

Run your parallel solver using the provided jobscript:

```
qsub job_heat_cpu.sh
```

### GPU-Accelerated Heat

Modify `heat.chpl` so that it works on either CPU or GPU, depending on how it is compiled.
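One possible structure (a sketch, assuming that arrays declared inside an `on` block targeting a GPU sublocale are allocated in device memory) mirrors the pi exercise:

```chapel
use ChplConfig;
const targetLoc = if CHPL_LOCALE_MODEL == "gpu" then here.gpus[0] else here;

on targetLoc {
  // For a GPU build these grids live in device memory, so the forall
  // stencil loops over them can run as kernels; for a CPU build this
  // block simply executes on the current locale.
  var u, u_tmp: [outerDom] real;
  // ... initialization, timestep loop, and solution check as before ...
}
```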

Run your GPU solver using the provided jobscript:

```
qsub job_heat_gpu.sh
```

How does the performance compare to the CPU version? Can you use Chapel GPU diagnostics or profiling (e.g. `nvprof`) to understand and improve the performance of your code?

### Inspecting the Generated Code

If you are comfortable reading [PTX code](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html), you can inspect the PTX that the Chapel compiler generates from your data-parallel loops. Add the compile option `--savec tmp` to the `CHPL_FLAGS` variable in the Makefile to instruct the Chapel compiler to save its intermediate generated code to the directory `tmp`. You should find the generated PTX in the file `tmp/chpl__gpu.s`.

In the PTX, each generated kernel is named after the file and line number of the Chapel code that it is generated from. For example, if your heat file contains a `forall` data-parallel loop on line 147, then the PTX should contain a generated kernel starting with the following line:

```
// .globl chpl_gpu_kernel_heat_line_147_ // -- Begin function chpl_gpu_kernel_heat_line_147_
```
191 changes: 191 additions & 0 deletions heat.chpl
@@ -0,0 +1,191 @@

/*
** PROGRAM: heat equation solver
**
** PURPOSE: This program explores the use of an explicit
**          finite difference method to solve the heat
**          equation under a method of manufactured solution (MMS)
**          scheme. The solution has been set to be a simple
**          function based on exponentials and trig functions.
**
**          A finite difference scheme is used on a 1000x1000 grid.
**          A total of 0.5 units of time are simulated.
**
**          The MMS solution has been adapted from
**          G.W. Recktenwald (2011). Finite difference approximations
**          to the Heat Equation. Portland State University.
**
**
** USAGE:   The grid size and number of timesteps are config constants.
**
**          For example, with 100x100 cells and 10 steps:
**
**          ./heat --n=100 --nsteps=10
**
**
** This Chapel program is translated from a C version originally
** written by Tom Deakin, Oct 2018
**
*/

use Time;
use Math;

param LINE = "--------------------\n"; // A line for fancy output

// Problem size, forms an nxn grid
config const n = 1000;
if n < 1 then halt("Error: n must be positive");

// Number of timesteps
config const nsteps = 10;
if nsteps < 1 then halt("Error: nsteps must be positive");

// Start the total program runtime timer
const start = timeSinceEpoch().totalSeconds();

//
// Set problem definition
//
param alpha = 0.1; // heat equation coefficient
param length = 1000.0; // physical size of domain: length x length square
const dx = length / (n+1); // physical size of each cell (+1 since the fixed boundary cells are not simulated)
const dt = 0.5 / nsteps; // time interval (total time of 0.5s)

// Stability requires that r = alpha * dt / dx^2 <= 0.5
const r = alpha * dt / (dx*dx);

// Print message detailing runtime configuration
writef("\n");
writef(" MMS heat equation\n\n");
writef(LINE);
writef("Problem input\n\n");
writef(" Grid size: %i x %i\n", n, n);
writef(" Cell width: %er\n", dx);
writef(" Grid length: %dr x %dr\n", length, length);
writef("\n");
writef(" Alpha: %er\n", alpha);
writef("\n");
writef(" Steps: %i\n", nsteps);
writef(" Total time: %er\n", dt*nsteps);
writef(" Time step: %er\n", dt);
writef(LINE);

// Stability check
writef("Stability\n\n");
writef(" r value: %dr\n", r);
if r > 0.5 then
  writef(" Warning: unstable\n");
writef(LINE);

// Allocate two nxn grids
const outerDom: domain(2) = {0..#n, 0..#n};
var arr1: [outerDom] real;
var arr2: [outerDom] real;
ref u = arr1;
ref u_tmp = arr2;

// Set the initial value of the grid under the MMS scheme
initialValue(n, dx, length, u);

//
// Run through timesteps under the explicit scheme
//

// Start the solve timer
const tic = timeSinceEpoch().totalSeconds();

for 0..#nsteps {
  // Call the solve kernel
  // Computes u_tmp at the next timestep
  // given the value of u at the current timestep
  solve(n, alpha, dx, dt, u, u_tmp);

  // reference swap
  u <=> u_tmp;
}

// Stop solve timer
const toc = timeSinceEpoch().totalSeconds();

//
// Check the L2-norm of the computed solution
// against the *known* solution from the MMS scheme
//
const norm = l2Norm(n, u, nsteps, dt, alpha, dx, length);

// Stop total timer
const stop = timeSinceEpoch().totalSeconds();

// Print results
writef("Results\n\n");
writef("Error (L2norm): %er\n", norm);
writef("Solve time (s): %dr\n", toc-tic);
writef("Total time (s): %dr\n", stop-start);
writef(LINE);

// Sets the mesh to an initial value, determined by the MMS scheme
proc initialValue(const n: int, const dx: real, const length: real, ref u: [?outerDom] real) {
  var y = dx; // Physical y position
  for j in 0..#n {
    var x = dx; // Physical x position
    for i in 0..#n {
      u[i,j] = sin(Math.pi * x / length) * sin(Math.pi * y / length);
      x += dx;
    }
    y += dx;
  }
}

// Compute the next timestep, given the current timestep
proc solve(const n: int, const alpha: real, const dx: real, const dt: real, const ref u: [?outerDom] real, ref u_tmp: [outerDom] real) {
  // Finite difference constant multiplier
  const r = alpha * dt / (dx*dx);
  const r2 = 1.0 - 4.0*r;

  // Loop over the nxn grid
  for (i,j) in outerDom {
    // Update the 5-point stencil, using boundary conditions on the edges of the domain.
    // Boundaries are zero because the MMS solution is zero there.
    u_tmp[i,j] = r2 * u[i,j] +
                 r * (if i < n-1 then u[i+1,j] else 0.0) +
                 r * (if i > 0 then u[i-1,j] else 0.0) +
                 r * (if j < n-1 then u[i,j+1] else 0.0) +
                 r * (if j > 0 then u[i,j-1] else 0.0);
  }
}

// True answer given by the manufactured solution
proc solution(const t: real, const x: real, const y: real, const alpha: real, const length: real) {
  return exp(-2.0*alpha*Math.pi*Math.pi*t/(length*length)) * sin(Math.pi*x/length) * sin(Math.pi*y/length);
}

// Computes the L2-norm of the computed grid and the MMS known solution
// The known solution is the same as the boundary function.
proc l2Norm(const n: int, const ref u: [?outerDom] real, const nsteps: int,
            const dt: real, const alpha: real, const dx: real, const length: real) {

  // Final (real) time simulated
  const time = dt * nsteps;

  // L2-norm error
  var l2norm = 0.0;

  // Loop over the grid and compute difference of computed and known solutions as an L2-norm
  var y = dx;
  for j in 0..#n {
    var x = dx;
    for i in 0..#n {
      const answer = solution(time, x, y, alpha, length);
      const err = u[i,j] - answer;
      l2norm += err*err;

      x += dx;
    }
    y += dx;
  }

  return sqrt(l2norm);
}
10 changes: 10 additions & 0 deletions job_heat_cpu.sh
@@ -0,0 +1,10 @@
#!/bin/bash

#PBS -P vp91
#PBS -q normal
#PBS -l ncpus=48
#PBS -l mem=100GB
#PBS -l walltime=00:02:00
#PBS -l wd

./heat_cpu -sn=8000 -snsteps 20
13 changes: 13 additions & 0 deletions job_heat_gpu.sh
@@ -0,0 +1,13 @@
#!/bin/bash

#PBS -P vp91
#PBS -q gpuvolta
#PBS -l ncpus=12
#PBS -l ngpus=1
#PBS -l mem=100GB
#PBS -l walltime=00:02:00
#PBS -l wd

module load cuda/12.3.2

./heat_gpu -sn=8000 -snsteps 20
10 changes: 10 additions & 0 deletions job_pi_cpu.sh
@@ -0,0 +1,10 @@
#!/bin/bash

#PBS -P vp91
#PBS -q normal
#PBS -l ncpus=48
#PBS -l mem=100GB
#PBS -l walltime=00:02:00
#PBS -l wd

./pi_cpu
13 changes: 13 additions & 0 deletions job_pi_gpu.sh
@@ -0,0 +1,13 @@
#!/bin/bash

#PBS -P vp91
#PBS -q gpuvolta
#PBS -l ncpus=12
#PBS -l ngpus=1
#PBS -l mem=100GB
#PBS -l walltime=00:02:00
#PBS -l wd

module load cuda/12.3.2

./pi_gpu