Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support openmp directive: Version 1 #263

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions examples/c/openmp/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include <stdlib.h>
#include<stdio.h>
// #include <openacc.h>
#include<omp.h>
#include <time.h>


/* Number of float elements per vector (2^30). The expansion is parenthesized so
 * the macro behaves as a single value inside larger expressions such as
 * `x / VECTOR_SIZE` or `x % VECTOR_SIZE`. */
#define VECTOR_SIZE (1024 * 1024 * 1024)

/*
 * Runs two element-wise additions over VECTOR_SIZE floats inside a single
 * OpenMP target region and prints the elapsed wall-clock time in seconds:
 *
 *   1. c[i] = a[i] + b[i]   (ascending index order)
 *   2. b[i] = a[i] + b[i]   (descending index order)
 *
 * a, b, c: arrays of at least VECTOR_SIZE floats. b and c are overwritten.
 *
 * All three arrays are mapped to the device before the timed section and
 * copied back afterwards, so the reported time excludes the explicit
 * enter/exit data transfers.
 */
void vec_add(float *a, float *b, float *c) {
    printf("Let's go...\n");

    #pragma omp target enter data map(to: a[0:VECTOR_SIZE])
    #pragma omp target enter data map(to: b[0:VECTOR_SIZE])
    #pragma omp target enter data map(to: c[0:VECTOR_SIZE])

    /* Wall-clock timestamps: clock() reports processor time consumed by the
     * process, which does not correspond to elapsed time for parallel or
     * offloaded work. */
    struct timespec start;
    struct timespec end;
    timespec_get(&start, TIME_UTC);

    #pragma omp target
    {
        #pragma omp parallel for
        for (unsigned int i = 0; i < VECTOR_SIZE; i++) {
            c[i] = a[i] + b[i];
        }

        /* A signed index lets the descending loop include element 0; the
         * previous `unsigned i; i > 0` form stopped at index 1. The loop stays
         * in OpenMP canonical form (var relop bound; var--). */
        #pragma omp parallel for
        for (int i = VECTOR_SIZE - 1; i >= 0; i--) {
            b[i] = a[i] + b[i];
        }
    }

    timespec_get(&end, TIME_UTC);
    double time_used = (double)(end.tv_sec - start.tv_sec)
                     + (double)(end.tv_nsec - start.tv_nsec) / 1e9;
    printf("time = %f\n", time_used);

    #pragma omp target exit data map(from: a[0:VECTOR_SIZE])
    #pragma omp target exit data map(from: b[0:VECTOR_SIZE])
    #pragma omp target exit data map(from: c[0:VECTOR_SIZE])

    printf("method done..\n");
}

/*
 * Allocates three VECTOR_SIZE-element float vectors, runs vec_add on them and
 * releases the memory.
 *
 * Returns EXIT_SUCCESS on completion, or EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    /* calloc zero-fills the buffers, so vec_add never reads indeterminate
     * values from a and b. */
    float *a = (float *) calloc(VECTOR_SIZE, sizeof(float));
    float *b = (float *) calloc(VECTOR_SIZE, sizeof(float));
    float *c = (float *) calloc(VECTOR_SIZE, sizeof(float));

    /* Each buffer is several GiB, so allocation failure is a realistic
     * outcome and must not reach vec_add as a NULL pointer. */
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Failed to allocate the input/output vectors\n");
        free(a);  /* free(NULL) is a no-op */
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    vec_add(a, b, c);

    printf("Done...\n");

    free(a);
    free(b);
    free(c);
    return EXIT_SUCCESS;
}
44 changes: 44 additions & 0 deletions examples/c/openmp/main2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include <stdlib.h>
#include <omp.h>
#define VECTOR_SIZE 100000000

#include <chrono>
// `nthreads` is referenced by the num_threads clause below but is not defined
// anywhere in this file; presumably the build supplies it (e.g. -Dnthreads=N).
// The fallback keeps the file compilable on its own.
// NOTE(review): 16 is one of the values used in the accompanying tuning
// example - confirm it is a sensible default.
#ifndef nthreads
#define nthreads 16
#endif

/// @brief Element-wise vector addition c[i] = a[i] + b[i] in an OpenMP target region.
///
/// The three arrays are mapped to the device before the timed section and
/// mapped back afterwards, so the returned time covers only the compute loop.
///
/// @param a    First input vector, at least `size` elements.
/// @param b    Second input vector, at least `size` elements.
/// @param c    Output vector, at least `size` elements; must not alias a or b.
/// @param size Number of elements to process.
/// @return Elapsed time of the compute loop in milliseconds.
extern "C" float vector_add(float * __restrict a, float * __restrict b, float * __restrict c, int size) {
    // Map exactly `size` elements: a hard-coded extent would transfer memory
    // beyond the arrays whenever the caller passes a smaller size.
    #pragma omp target enter data map(to: a[0:size])
    #pragma omp target enter data map(to: b[0:size])
    #pragma omp target enter data map(to: c[0:size])

    const auto kt_timing_start = std::chrono::steady_clock::now();
    #pragma omp target parallel for num_threads(nthreads)
    for (int i = 0; i < size; i++) {
        c[i] = a[i] + b[i];
    }
    const auto kt_timing_end = std::chrono::steady_clock::now();
    const std::chrono::duration<float, std::milli> elapsed_time = kt_timing_end - kt_timing_start;

    #pragma omp target exit data map(from: a[0:size])
    #pragma omp target exit data map(from: b[0:size])
    #pragma omp target exit data map(from: c[0:size])

    return elapsed_time.count();
}

// `restrict` is a C keyword, not a C++ one; `__restrict` is the spelling
// accepted as an extension by the common C++ compilers.
extern "C" float vector_add(float * __restrict a, float * __restrict b, float * __restrict c, int size);

/// @brief Allocates three zero-filled vectors of VECTOR_SIZE floats and runs vector_add once.
/// @return EXIT_SUCCESS on completion, EXIT_FAILURE if an allocation fails.
int main() {
    // The vectors live on the heap: three VECTOR_SIZE-element float arrays
    // (hundreds of MB each) do not fit on a typical stack, and free() may only
    // be called on pointers obtained from malloc/calloc/realloc.
    float *a = static_cast<float *>(malloc(VECTOR_SIZE * sizeof(float)));
    float *b = static_cast<float *>(malloc(VECTOR_SIZE * sizeof(float)));
    float *c = static_cast<float *>(malloc(VECTOR_SIZE * sizeof(float)));

    if (a == nullptr || b == nullptr || c == nullptr) {
        free(a);  // free(nullptr) is a no-op
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < VECTOR_SIZE; i++) {
        a[i] = b[i] = c[i] = 0;
    }

    vector_add(a, b, c, VECTOR_SIZE);

    // std::cout<< "Done...\n";

    free(a);
    free(b);
    free(c);
    return EXIT_SUCCESS;
}
Binary file added examples/c/openmp/main2.o
Binary file not shown.
28 changes: 15 additions & 13 deletions examples/directives/vector_add_c_openacc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
Code,
DirectiveCode,
OpenACC,
Cxx,
extract_directive_signature,
Expand Down Expand Up @@ -40,7 +40,7 @@
"""

# Extract tunable directive
app = Code(OpenACC(), Cxx())
app = DirectiveCode(OpenACC(), Cxx())
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code, app)
body = extract_directive_code(code, app)
Expand All @@ -59,14 +59,16 @@

answer = [None, None, args[0] + args[1], None]

tune_kernel(
"vector_add",
kernel_string,
0,
args,
tune_params,
metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvc++",
)
print(kernel_string)

# tune_kernel(
# "vector_add",
# kernel_string,
# 0,
# args,
# tune_params,
# metrics=metrics,
# answer=answer,
# compiler_options=["-fast", "-acc=gpu"],
# compiler="nvc++",
# )
69 changes: 69 additions & 0 deletions examples/directives/vector_add_c_openmp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
DirectiveCode,
OpenMP,
Cxx,
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)

# Annotated C++ source that the directive helpers below parse.
# The region between `#pragma tuner start vector_add ...` and `#pragma tuner stop`
# is the part extracted under the name "vector_add". The start pragma also lists
# a, b, c (float*, VECTOR_SIZE) and size (int, VECTOR_SIZE) - presumably the
# arguments and sizes of the generated function; confirm against
# kernel_tuner.utils.directives.
# `nthreads` in the num_threads clause is not defined in this source; it matches
# the key of the tunable parameter set in tune_params further down.
code = """
#include <stdlib.h>
#include <omp.h>

#define VECTOR_SIZE 100000000

void vector_add(float *a, float *b, float *c) {
#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
#pragma omp parallel for num_threads(nthreads)
for ( int i = 0; i < size; i++ ) {
c[i] = a[i] + b[i];
}
#pragma tuner stop
}
"""

# Describe the input as OpenMP directives embedded in C++ and pull the
# individual pieces out of the annotated source.
app = DirectiveCode(OpenMP(), Cxx())
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code, app)
body = extract_directive_code(code, app)

# Host-side arguments for the "vector_add" region.
data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)

# Assemble the source string that is handed to the tuner.
kernel_string = generate_directive_function(
    preprocessor,
    signature["vector_add"],
    body["vector_add"],
    app,
    data=data["vector_add"],
)

# Candidate values for the `nthreads` placeholder used in the num_threads clause.
tune_params = {"nthreads": [16, 32]}

# Bandwidth metric: (2 * 4 + 4) bytes per element of args[0], divided by
# x["time"] / 10**3 and then by 10**9.
# NOTE(review): this presumably treats x["time"] as milliseconds - confirm.
metrics = {
    "GB/s": lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9,
}

# Output verification is currently disabled; the reference that was left
# commented out here is:
# answer = [None, None, args[0] + args[1], None]

# Show the generated source before tuning starts.
print(kernel_string)

tune_kernel(
    "vector_add",
    kernel_string,
    0,
    args,
    tune_params,
    metrics=metrics,
    compiler_options=["-fopenmp", "-mp=gpu"],
    compiler="nvc++",
)
4 changes: 2 additions & 2 deletions examples/directives/vector_add_fortran_openacc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
Code,
DirectiveCode,
OpenACC,
Fortran,
extract_directive_signature,
Expand Down Expand Up @@ -35,7 +35,7 @@
"""

# Extract tunable directive
app = Code(OpenACC(), Fortran())
app = DirectiveCode(OpenACC(), Fortran())
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code, app)
body = extract_directive_code(code, app)
Expand Down
Loading