From 1f9da85a7c234cf8d80648cdfb2c12d35110ea68 Mon Sep 17 00:00:00 2001
From: Denis Kumar
Date: Sun, 9 Jun 2024 22:19:03 +0200
Subject: [PATCH 1/2] Support OpenMP directive: Version 1

---
 examples/c/openmp/main.c                      |  61 ++++++
 examples/c/openmp/main2.cpp                   |  44 +++++
 examples/c/openmp/main2.o                     | Bin 0 -> 11544 bytes
 examples/directives/vector_add_c_openacc.py   |  28 +--
 examples/directives/vector_add_c_openmp.py    |  69 +++++++
 .../directives/vector_add_fortran_openacc.py  |   4 +-
 kernel_tuner/utils/directives.py              | 178 ++++++++++++------
 test/utils/test_directives.py                 |  28 +--
 8 files changed, 330 insertions(+), 82 deletions(-)
 create mode 100644 examples/c/openmp/main.c
 create mode 100644 examples/c/openmp/main2.cpp
 create mode 100644 examples/c/openmp/main2.o
 create mode 100644 examples/directives/vector_add_c_openmp.py

diff --git a/examples/c/openmp/main.c b/examples/c/openmp/main.c
new file mode 100644
index 000000000..6dc45bcb5
--- /dev/null
+++ b/examples/c/openmp/main.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+// #include
+#include <time.h>
+#include <omp.h>
+
+
+#define VECTOR_SIZE 1024*1024*1024
+
+void vec_add(float *a, float *b, float *c) {
+    printf("Let's go...\n");
+
+    #pragma omp target enter data map(to: a[0:VECTOR_SIZE])
+    #pragma omp target enter data map(to: b[0:VECTOR_SIZE])
+    #pragma omp target enter data map(to: c[0:VECTOR_SIZE])
+
+    clock_t start = clock();
+
+    #pragma omp target
+    {
+        #pragma omp parallel for
+        for (unsigned int i = 0; i < VECTOR_SIZE; i++ ) {
+            c[i] = a[i] + b[i];
+        }
+
+        #pragma omp parallel for
+        for (unsigned int i = VECTOR_SIZE - 1; i > 0; i-- ) {
+            b[i] = a[i] + b[i];
+        }
+    }
+
+    // #pragma omp parallel for
+    // for (unsigned int i = vec_size - 1; i > 0; i-- ) {
+    //     a[i] = a[i] + b[i];
+    // }
+
+    clock_t end = clock();
+    float time_used = (float)(end - start) / CLOCKS_PER_SEC;
+    printf("time = %f\n", time_used);
+
+    #pragma omp target exit data map(from: a[0:VECTOR_SIZE])
+    #pragma omp target exit data map(from: b[0:VECTOR_SIZE])
+    #pragma omp target exit data map(from: c[0:VECTOR_SIZE])
+
+    printf("method done..\n");
+}
+
+int main(void) {
+    float * a = (float *) malloc(VECTOR_SIZE * sizeof(float));
+    float * b = (float *) malloc(VECTOR_SIZE * sizeof(float));
+    float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));
+
+    vec_add(a, b, c);
+
+    printf("Done...\n");
+
+    free(a);
+    free(b);
+    free(c);
+}
\ No newline at end of file
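A note on the timing in main.c: clock() reports process CPU time on most platforms, which undercounts time spent waiting on the device. The std::chrono pattern used by main2.cpp below, and by the wrappers that generate_directive_function emits later in this patch, measures wall time instead. A minimal hand-written sketch of that pattern (the kt_timing_* names follow the patch; the map clauses here are illustrative):

#include <chrono>

// Offloaded region timed with a steady (wall-clock) timer, as in the
// generated wrappers; the return value is elapsed milliseconds.
float timed_vector_add(float *a, float *b, float *c, int size) {
    auto kt_timing_start = std::chrono::steady_clock::now();
    #pragma omp target teams distribute parallel for \
        map(to: a[0:size], b[0:size]) map(tofrom: c[0:size])
    for (int i = 0; i < size; i++) {
        c[i] = a[i] + b[i];
    }
    auto kt_timing_end = std::chrono::steady_clock::now();
    std::chrono::duration<float, std::milli> elapsed_time = kt_timing_end - kt_timing_start;
    return elapsed_time.count();
}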
diff --git a/examples/c/openmp/main2.cpp b/examples/c/openmp/main2.cpp
new file mode 100644
index 000000000..9d9c04c08
--- /dev/null
+++ b/examples/c/openmp/main2.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include <cstdlib>
+#define VECTOR_SIZE 100000000
+
+#include <chrono>
+// nthreads is not declared here: it is the tune parameter substituted by the tuner
+extern "C" float vector_add(float * restrict a, float * restrict b, float * restrict c, int size){
+    #pragma omp target enter data map(to: a[0:100000000])
+    #pragma omp target enter data map(to: b[0:100000000])
+    #pragma omp target enter data map(to: c[0:100000000])
+    auto kt_timing_start = std::chrono::steady_clock::now();
+    #pragma omp target parallel for num_threads(nthreads)
+    for ( int i = 0; i < size; i++ ) {
+        c[i] = a[i] + b[i];
+    }
+
+    auto kt_timing_end = std::chrono::steady_clock::now();
+    std::chrono::duration<float, std::milli> elapsed_time = kt_timing_end - kt_timing_start;
+    #pragma omp target exit data map(from: a[0:100000000])
+    #pragma omp target exit data map(from: b[0:100000000])
+    #pragma omp target exit data map(from: c[0:100000000])
+
+    return elapsed_time.count();
+
+}
+
+int main() {
+    // Heap allocation: VECTOR_SIZE floats (~400 MB per array) would overflow
+    // the stack, and the free() calls below require heap pointers.
+    float *a = (float *) malloc(VECTOR_SIZE * sizeof(float));
+    float *b = (float *) malloc(VECTOR_SIZE * sizeof(float));
+    float *c = (float *) malloc(VECTOR_SIZE * sizeof(float));
+
+    for (int i = 0; i < VECTOR_SIZE; i++) {
+        a[i] = b[i] = c[i] = 0;
+    }
+
+    vector_add(a, b, c, VECTOR_SIZE);
+
+    // std::cout << "Done...\n";
+
+    free(a);
+    free(b);
+    free(c);
+}
\ No newline at end of file
diff --git a/examples/c/openmp/main2.o b/examples/c/openmp/main2.o
new file mode 100644
index 0000000000000000000000000000000000000000..8b9b9b2655440ebcfd80de0bb1bcb440ed810071
GIT binary patch
literal 11544
[base85-encoded binary data omitted]
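main2.cpp compiles only once nthreads has been given a value: it is the tune parameter that Kernel Tuner injects at compile time, typically as a preprocessor definition prepended to the source. Roughly, for one configuration (a hand-written illustration, not part of the patch):

// What the tuner effectively compiles for the configuration nthreads=32
// (the define is assumed to be prepended by the tuner):
#define nthreads 32

#pragma omp target parallel for num_threads(nthreads)
for (int i = 0; i < size; i++) {
    c[i] = a[i] + b[i];
}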
diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py
new file mode 100644
index 000000000..c1fa5473c
--- /dev/null
+++ b/examples/directives/vector_add_c_openmp.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
+
+from kernel_tuner import tune_kernel
+from kernel_tuner.utils.directives import (
+    DirectiveCode,
+    OpenMP,
+    Cxx,
+    extract_directive_signature,
+    extract_directive_code,
+    extract_preprocessor,
+    generate_directive_function,
+    extract_directive_data,
+    allocate_signature_memory,
+)
+
+code = """
+#include <stdio.h>
+#include <stdlib.h>
+
+#define VECTOR_SIZE 100000000
+
+void vector_add(float *a, float *b, float *c) {
+    #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
+    #pragma omp parallel for num_threads(nthreads)
+    for ( int i = 0; i < size; i++ ) {
+        c[i] = a[i] + b[i];
+    }
+    #pragma tuner stop
+}
+"""
+
+# Extract tunable directive
+app = DirectiveCode(OpenMP(), Cxx())
+preprocessor = extract_preprocessor(code)
+signature = extract_directive_signature(code, app)
+body = extract_directive_code(code, app)
+# Allocate memory on the host
+data = extract_directive_data(code, app)
+args = allocate_signature_memory(data["vector_add"], preprocessor)
+# Generate kernel string
+kernel_string = generate_directive_function(
+    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
+)
+
+tune_params = dict()
+tune_params["nthreads"] = [16, 32]
+metrics = dict()
+metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9
+
+# answer = [None, None, args[0] + args[1], None]
+
+# print(preprocessor)
+# print(signature)
+# print(body)
+# print(data)
+# print(args)
+print(kernel_string)
+
+tune_kernel(
+    "vector_add",
+    kernel_string,
+    0,
+    args,
+    tune_params,
+    metrics=metrics,
+    compiler_options=["-fopenmp", "-mp=gpu"],
+    compiler="nvc++",
+)
diff --git a/examples/directives/vector_add_fortran_openacc.py b/examples/directives/vector_add_fortran_openacc.py
index 29e94646a..760519c0b 100644
--- a/examples/directives/vector_add_fortran_openacc.py
+++ b/examples/directives/vector_add_fortran_openacc.py
@@ -3,7 +3,7 @@
 from kernel_tuner import tune_kernel
 from kernel_tuner.utils.directives import (
-    Code,
+    DirectiveCode,
     OpenACC,
     Fortran,
     extract_directive_signature,
@@ -35,7 +35,7 @@
 """
 
 # Extract tunable directive
-app = Code(OpenACC(), Fortran())
+app = DirectiveCode(OpenACC(), Fortran())
 preprocessor = extract_preprocessor(code)
 signature = extract_directive_signature(code, app)
 body = extract_directive_code(code, app)
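The directives.py changes below add OpenMP counterparts to the existing OpenACC helpers. In particular, wrap_data_openmp brackets the timed body with data-movement pragmas built by create_data_directive_openmp_cxx and exit_data_directive_openmp_cxx. Written out for a single mapped array (hand-written, following the format strings in the patch):

// Pattern emitted around the timed body for a pointer argument "a"
// of VECTOR_SIZE elements:
#pragma omp target enter data map(to: a[0:VECTOR_SIZE])
// ... timed kernel body ...
#pragma omp target exit data map(from: a[0:VECTOR_SIZE])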
"""Check if line contains an OpenMp parallel directive or not""" + if is_cxx(lang): + return line_contains_openmp_directive_cxx(line) + elif is_fortran(lang): + # TODO + return False + return False + +def line_contains_openmp_directive_cxx(line: str) -> bool: + """Check if a line of code contains any of a C++ OpenMP directives or not""" + clasuses = ['teams', 'distribute', 'parallel', 'simd'] + return any(list(map(lambda x: line_contains(line, f'#pragma omp {x}'), clasuses))) def line_contains(line: str, target: str) -> bool: """Generic helper to check if a line contains the target""" return target in line -def openacc_directive_contains_clause(line: str, clauses: list) -> bool: +def directive_contains_clause(line: str, clauses: list) -> bool: """Check if an OpenACC directive contains one clause from a list""" for clause in clauses: if clause in line: return True return False - def openacc_directive_contains_data_clause(line: str) -> bool: """Check if an OpenACC directive contains one data clause""" data_clauses = ["copy", "copyin", "copyout", "create", "no_create", "present", "device_ptr", "attach"] - return openacc_directive_contains_clause(line, data_clauses) + return directive_contains_clause(line, data_clauses) +def openmp_directive_contains_data_clause(line: str) -> bool: + """Check if an OpenMP directive contains one data clause""" + data_clauses = ["target map", "target enter data", "target data map"] + return directive_contains_clause(line, data_clauses) def create_data_directive_openacc(name: str, size: ArraySize, lang: Language) -> str: """Create a data directive for a given language""" @@ -175,6 +200,9 @@ def create_data_directive_openacc_cxx(name: str, size: ArraySize) -> str: """Create C++ OpenACC code to allocate and copy data""" return f"#pragma acc enter data create({name}[:{size.get()}])\n#pragma acc update device({name}[:{size.get()}])\n" +def create_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to allocate and copy data""" + return f"#pragma omp target enter data map(to: {name}[0:{size.get()}])\n" def create_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: """Create Fortran OpenACC code to allocate and copy data""" @@ -200,6 +228,9 @@ def exit_data_directive_openacc_cxx(name: str, size: ArraySize) -> str: """Create C++ OpenACC code to copy back data""" return f"#pragma acc exit data copyout({name}[:{size.get()}])\n" +def exit_data_directive_openmp_cxx(name: str, size: ArraySize) -> str: + """Create C++ OpenMP code to copy back data""" + return f"#pragma omp target exit data map(from: {name}[0:{size.get()}])\n" def exit_data_directive_openacc_fortran(name: str, size: ArraySize) -> str: """Create Fortran OpenACC code to copy back data""" @@ -228,7 +259,7 @@ def find_size_in_preprocessor(dimension: str, preprocessor: list) -> int: return ret_size -def extract_code(start: str, stop: str, code: str, langs: Code, kernel_name: str = None) -> dict: +def extract_code(start: str, stop: str, code: str, directive_code: DirectiveCode, kernel_name: str = None) -> dict: """Extract an arbitrary section of code""" found_section = False sections = dict() @@ -250,9 +281,9 @@ def extract_code(start: str, stop: str, code: str, langs: Code, kernel_name: str if kernel_name is None or correct_kernel(kernel_name, line): found_section = True try: - if is_cxx(langs.language): + if is_cxx(directive_code.language): name = line.strip().split(" ")[3] - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): name = 
line.strip().split(" ")[2] except IndexError: name = f"init_{init_found}" @@ -336,49 +367,65 @@ def end_timing_cxx(code: str) -> str: return code + "\nreturn elapsed_time.count();\n" -def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str: +def wrap_data(code: str, directive_code: DirectiveCode, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str: + if(is_openacc(directive_code.directive)): + return wrap_data_openacc(code, directive_code, data, preprocessor, user_dimensions) + elif(is_openmp(directive_code.directive)): + return wrap_data_openmp(code, directive_code, data, preprocessor, user_dimensions) + +def wrap_data_openacc(code: str, directive_code: DirectiveCode, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str: """Insert data directives before and after the timed code""" intro = str() outro = str() for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) - if is_openacc(langs.directive) and is_cxx(langs.language): + if is_cxx(directive_code.language): intro += create_data_directive_openacc_cxx(name, size) outro += exit_data_directive_openacc_cxx(name, size) - elif is_openacc(langs.directive) and is_fortran(langs.language): - if "," in data[name][1]: - # Multi dimensional - pass - else: - # One dimensional - intro += create_data_directive_openacc_fortran(name, size) - outro += exit_data_directive_openacc_fortran(name, size) + elif is_fortran(directive_code.language): + intro += create_data_directive_openacc_fortran(name, size) + outro += exit_data_directive_openacc_fortran(name, size) + return intro + code + outro + +def wrap_data_openmp(code: str, directive_code: DirectiveCode, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str: + """Insert data directives before and after the timed code""" + intro = str() + outro = str() + for name in data.keys(): + if "*" in data[name][0]: + size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) + if is_cxx(directive_code.language): + intro += create_data_directive_openmp_cxx(name, size) + outro += exit_data_directive_openmp_cxx(name, size) + # elif is_fortran(langs.language): + # intro += create_data_directive_openacc_fortran(name, size) + # outro += exit_data_directive_openacc_fortran(name, size) return intro + code + outro -def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict: +def extract_directive_code(code: str, directive_code: DirectiveCode, kernel_name: str = None) -> dict: """Extract explicitly marked directive sections from code""" - if is_cxx(langs.language): + if is_cxx(directive_code.language): start_string = "#pragma tuner start" end_string = "#pragma tuner stop" - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): start_string = "!$tuner start" end_string = "!$tuner stop" - return extract_code(start_string, end_string, code, langs, kernel_name) + return extract_code(start_string, end_string, code, directive_code, kernel_name) -def extract_initialization_code(code: str, langs: Code) -> str: +def extract_initialization_code(code: str, directive_code: DirectiveCode) -> str: """Extract the initialization section from code""" - if is_cxx(langs.language): + if is_cxx(directive_code.language): start_string = "#pragma tuner initialize" end_string = "#pragma tuner stop" - elif is_fortran(langs.language): + elif 
is_fortran(directive_code.language): start_string = "!$tuner initialize" end_string = "!$tuner stop" - init_code = extract_code(start_string, end_string, code, langs) + init_code = extract_code(start_string, end_string, code, directive_code) if len(init_code) >= 1: return "\n".join(init_code.values()) + "\n" else: @@ -403,12 +450,12 @@ def format_argument_fortran(p_type: str, p_size: int, p_name: str) -> str: return argument -def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) -> dict: +def extract_directive_signature(code: str, directive_code: DirectiveCode, kernel_name: str = None) -> dict: """Extract the user defined signature for directive sections""" - if is_cxx(langs.language): + if is_cxx(directive_code.language): start_string = "#pragma tuner start" - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): start_string = "!$tuner start" signatures = dict() @@ -416,10 +463,10 @@ def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) if start_string in line: if kernel_name is None or correct_kernel(kernel_name, line): tmp_string = line.strip().split(" ") - if is_cxx(langs.language): + if is_cxx(directive_code.language): name = tmp_string[3] tmp_string = tmp_string[4:] - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): name = tmp_string[2] tmp_string = tmp_string[3:] params = list() @@ -432,13 +479,13 @@ def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) p_type = p_type.split(":")[0] if "*" in p_type: p_type = p_type.replace("*", " * restrict") - if is_cxx(langs.language): + if is_cxx(directive_code.language): params.append(f"{p_type} {p_name}") - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): params.append(p_name) - if is_cxx(langs.language): + if is_cxx(directive_code.language): signatures[name] = f"float {name}({', '.join(params)})" - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): signatures[ name ] = f"function {name}({', '.join(params)}) result(timing)\nuse iso_c_binding\nimplicit none\n" @@ -460,22 +507,22 @@ def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) return signatures -def extract_directive_data(code: str, langs: Code, kernel_name: str = None) -> dict: +def extract_directive_data(code: str, directive_code: DirectiveCode, kernel_name: str = None) -> dict: """Extract the data used in the directive section""" - if is_cxx(langs.language): + if is_cxx(directive_code.language): start_string = "#pragma tuner start" - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): start_string = "!$tuner start" data = dict() for line in code.replace("\\\n", "").split("\n"): if start_string in line: if kernel_name is None or correct_kernel(kernel_name, line): - if is_cxx(langs.language): + if is_cxx(directive_code.language): name = line.strip().split(" ")[3] tmp_string = line.strip().split(" ")[4:] - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): name = line.strip().split(" ")[2] tmp_string = line.strip().split(" ")[3:] data[name] = dict() @@ -510,7 +557,7 @@ def generate_directive_function( preprocessor: list, signature: str, body: str, - langs: Code, + directive_code: DirectiveCode, data: dict = None, initialization: str = "", user_dimensions: dict = None, @@ -522,29 +569,29 @@ def generate_directive_function( # add user dimensions to preprocessor for key, value in user_dimensions.items(): code += f"#define {key} 
{value}\n" - if is_cxx(langs.language) and "#include " not in preprocessor: + if is_cxx(directive_code.language) and "#include " not in preprocessor: code += "\n#include \n" - if is_cxx(langs.language): + if is_cxx(directive_code.language): code += 'extern "C" ' + signature + "{\n" - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): code += "\nmodule kt\nuse iso_c_binding\ncontains\n" code += "\n" + signature if len(initialization) > 1: code += initialization + "\n" if data is not None: - body = add_present_openacc(body, langs, data, preprocessor, user_dimensions) - if is_cxx(langs.language): + body = process_data_on_device(body, directive_code, data, preprocessor, user_dimensions) + if is_cxx(directive_code.language): body = start_timing_cxx(body) if data is not None: - code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) + code += wrap_data(body + "\n", directive_code, data, preprocessor, user_dimensions) else: code += body code = end_timing_cxx(code) code += "\n}" - elif is_fortran(langs.language): - body = wrap_timing(body, langs.language) + elif is_fortran(directive_code.language): + body = wrap_timing(body, directive_code.language) if data is not None: - code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) + code += wrap_data(body + "\n", directive_code, data, preprocessor, user_dimensions) else: code += body name = signature.split(" ")[1].split("(")[0] @@ -606,13 +653,35 @@ def add_new_line(line: str) -> str: return line +def process_data_on_device( + code: str, directive_code: DirectiveCode, data: dict, preprocessor: list = None, user_dimensions: dict = None +) -> str: + if(is_openacc(directive_code.directive)): + return add_present_openacc(code, directive_code, data, preprocessor, user_dimensions) + elif(is_openmp(directive_code.directive)): + return add_target_openmp(code, directive_code, data, preprocessor, user_dimensions) + + +def add_target_openmp(code: str, directive_code: DirectiveCode, data: dict, preprocessor: list = None, user_dimensions: dict = None): + """Add the present clause to OpenACC directive""" + new_body = "" + for line in code.replace("\\\n", "").split("\n"): + if line_contains_openmp_directive(line, directive_code.language): + parts = line.split('#pragma omp') + new_line = f"{parts[0]}#pragma omp target {parts[1]}" + new_body += new_line + else: + new_body += line + new_body = add_new_line(new_body) + return new_body + def add_present_openacc( - code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None + code: str, directive_code: DirectiveCode, data: dict, preprocessor: list = None, user_dimensions: dict = None ) -> str: """Add the present clause to OpenACC directive""" new_body = "" for line in code.replace("\\\n", "").split("\n"): - if not line_contains_openacc_parallel_directive(line, langs.language): + if not line_contains_openacc_parallel_directive(line, directive_code.language): new_body += line else: # The line contains an OpenACC directive @@ -625,9 +694,9 @@ def add_present_openacc( for name in data.keys(): if "*" in data[name][0]: size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) - if is_cxx(langs.language): + if is_cxx(directive_code.language): present_clause += add_present_openacc_cxx(name, size) - elif is_fortran(langs.language): + elif is_fortran(directive_code.language): present_clause += add_present_openacc_fortran(name, size) new_body += new_line + present_clause.rstrip() + "\n" new_body = 
add_new_line(new_body) @@ -638,6 +707,9 @@ def add_present_openacc_cxx(name: str, size: ArraySize) -> str: """Create present clause for C++ OpenACC directive""" return f" present({name}[:{size.get()}]) " +def add_target_openmp_cxx(name: str, size: ArraySize) -> str: + """Create present clause for C++ OpenMP directive""" + return f" omp target enter data map(to: {name}[0:{size.get()}]) " def add_present_openacc_fortran(name: str, size: ArraySize) -> str: """Create present clause for Fortran OpenACC directive""" diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py index 23cd3b395..4d646f3a3 100644 --- a/test/utils/test_directives.py +++ b/test/utils/test_directives.py @@ -99,8 +99,8 @@ def test_wrap_timing(): def test_wrap_data(): - acc_cxx = Code(OpenACC(), Cxx()) - acc_f90 = Code(OpenACC(), Fortran()) + acc_cxx = DirectiveCode(OpenACC(), Cxx()) + acc_f90 = DirectiveCode(OpenACC(), Fortran()) code_cxx = "// this is a comment\n" code_f90 = "! this is a comment\n" data = {"array": ["int*", "size"]} @@ -156,7 +156,7 @@ def test_extract_directive_code(): for ( int i = 0; i < size; i++ ) { c[i] = a[i] + b[i]; }""" - acc_cxx = Code(OpenACC(), Cxx()) + acc_cxx = DirectiveCode(OpenACC(), Cxx()) returns = extract_directive_code(code, acc_cxx) assert len(returns) == 2 assert expected_one in returns["initialize"] @@ -179,7 +179,7 @@ def test_extract_directive_code(): C(i) = A(i) + B(i) end do !$acc end parallel loop""" - returns = extract_directive_code(code, Code(OpenACC(), Fortran()), "vector_add") + returns = extract_directive_code(code, DirectiveCode(OpenACC(), Fortran()), "vector_add") assert len(returns) == 1 assert expected in returns["vector_add"] @@ -226,7 +226,7 @@ def test_extract_preprocessor(): def test_extract_directive_signature(): - acc_cxx = Code(OpenACC(), Cxx()) + acc_cxx = DirectiveCode(OpenACC(), Cxx()) code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) \n#pragma acc" signatures = extract_directive_signature(code, acc_cxx) assert len(signatures) == 1 @@ -243,13 +243,13 @@ def test_extract_directive_signature(): signatures = extract_directive_signature(code, acc_cxx, "vector_add_ext") assert len(signatures) == 0 code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" - signatures = extract_directive_signature(code, Code(OpenACC(), Fortran())) + signatures = extract_directive_signature(code, DirectiveCode(OpenACC(), Fortran())) assert len(signatures) == 1 assert "function vector_add(A, B, C, n)" in signatures["vector_add"] def test_extract_directive_data(): - acc_cxx = Code(OpenACC(), Cxx()) + acc_cxx = DirectiveCode(OpenACC(), Cxx()) code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" data = extract_directive_data(code, acc_cxx) assert len(data) == 1 @@ -259,7 +259,7 @@ def test_extract_directive_data(): assert "VECTOR_SIZE" in data["vector_add"]["size"] data = extract_directive_data(code, acc_cxx, "vector_add_double") assert len(data) == 0 - acc_f90 = Code(OpenACC(), Fortran()) + acc_f90 = DirectiveCode(OpenACC(), Fortran()) code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" data = extract_directive_data(code, acc_f90) assert len(data) == 1 @@ -279,7 +279,7 @@ def test_extract_directive_data(): def test_allocate_signature_memory(): code = "#pragma tuner 
start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc"
-    data = extract_directive_data(code, Code(OpenACC(), Cxx()))
+    data = extract_directive_data(code, DirectiveCode(OpenACC(), Cxx()))
     args = allocate_signature_memory(data["vector_add"])
     assert args[3] == 0
     preprocessor = ["#define VECTOR_SIZE 1024\n"]
@@ -299,7 +299,7 @@
     code = (
         "!$tuner start matrix_add A(float*:N_ROWS,N_COLS) B(float*:N_ROWS,N_COLS) nr(int:N_ROWS) nc(int:N_COLS)\n!$acc"
     )
-    data = extract_directive_data(code, Code(OpenACC(), Fortran()))
+    data = extract_directive_data(code, DirectiveCode(OpenACC(), Fortran()))
     preprocessor = ["#define N_ROWS 128\n", "#define N_COLS 512\n"]
     args = allocate_signature_memory(data["matrix_add"], preprocessor)
     assert args[2] == 128
@@ -315,13 +315,13 @@ def test_extract_initialization_code():
     code_cpp = "#pragma tuner initialize\nconst int value = 42;\n#pragma tuner stop\n"
     code_f90 = "!$tuner initialize\ninteger :: value\n!$tuner stop\n"
-    assert extract_initialization_code(code_cpp, Code(OpenACC(), Cxx())) == "const int value = 42;\n"
-    assert extract_initialization_code(code_f90, Code(OpenACC(), Fortran())) == "integer :: value\n"
+    assert extract_initialization_code(code_cpp, DirectiveCode(OpenACC(), Cxx())) == "const int value = 42;\n"
+    assert extract_initialization_code(code_f90, DirectiveCode(OpenACC(), Fortran())) == "integer :: value\n"
 
 
 def test_add_present_openacc():
-    acc_cxx = Code(OpenACC(), Cxx())
-    acc_f90 = Code(OpenACC(), Fortran())
+    acc_cxx = DirectiveCode(OpenACC(), Cxx())
+    acc_f90 = DirectiveCode(OpenACC(), Fortran())
     code_cxx = "#pragma acc parallel num_gangs(32)\n#pragma acc\n"
     code_f90 = "!$acc parallel async num_workers(16)\n"
     data = {"array": ["int*", "size"]}

From bd3c874944cdcd1ccb35303b658da00bc93d41a3 Mon Sep 17 00:00:00 2001
From: Denis Kumar
Date: Fri, 5 Jul 2024 17:12:41 +0200
Subject: [PATCH 2/2] first draft

---
 .../auto-generation/add_schedule_example.py   |  39 +++
 examples/auto-generation/new.py               |  39 +++
 .../auto-generation/schedule_chunk_example.py |  40 +++
 examples/auto-generation/temp_n6ijqzaa        | 193 ++++++++++++
 examples/directives/vector_add_c_openacc.py   |  48 ++-
 examples/directives/vector_add_c_openmp.py    |  43 +--
 kernel_tuner/generation/code/__init__.py      |   0
 kernel_tuner/generation/code/code.py          |  29 ++
 kernel_tuner/generation/code/context.py       | 121 ++++++++
 kernel_tuner/generation/code/line.py          |  47 +++
 kernel_tuner/generation/generation.py         |  45 +++
 kernel_tuner/generation/rules/__init__.py     |   0
 .../rules/add_chunk_size_to_schedule_rule.py  |  39 +++
 .../add_num_threads_and_distribute_rule.py    |  48 +++
 .../generation/rules/add_schedule_rule.py     |  56 ++++
 kernel_tuner/generation/rules/rule.py         |  22 ++
 kernel_tuner/generation/token/__init__.py     |   0
 kernel_tuner/generation/token/code_token.py   | 275 ++++++++++++++++++
 kernel_tuner/generation/token/pragma_token.py | 152 ++++++++++
 kernel_tuner/generation/token/token.py        | 149 ++++++++++
 kernel_tuner/generation/tree/__init__.py      |   0
 kernel_tuner/generation/tree/tree.py          | 128 ++++++++
 kernel_tuner/generation/utils/util.py         |  21 ++
 kernel_tuner/interface.py                     |  99 ++++++-
 kernel_tuner/runners/sequential.py            |   1 +
 kernel_tuner/util.py                          |   6 +-
 kernel_tuner/utils/directives.py              |  13 +
 noxfile.py                                    |   2 +-
 28 files changed, 1584 insertions(+), 71 deletions(-)
 create mode 100644 examples/auto-generation/add_schedule_example.py
 create mode 100644 examples/auto-generation/new.py
 create mode 100644 examples/auto-generation/schedule_chunk_example.py
 create mode 100644 examples/auto-generation/temp_n6ijqzaa
 create mode 100644 kernel_tuner/generation/code/__init__.py
 create mode 100644 kernel_tuner/generation/code/code.py
 create mode 100644 kernel_tuner/generation/code/context.py
 create mode 100644 kernel_tuner/generation/code/line.py
 create mode 100644 kernel_tuner/generation/generation.py
 create mode 100644 kernel_tuner/generation/rules/__init__.py
 create mode 100644 kernel_tuner/generation/rules/add_chunk_size_to_schedule_rule.py
 create mode 100644 kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py
 create mode 100644 kernel_tuner/generation/rules/add_schedule_rule.py
 create mode 100644 kernel_tuner/generation/rules/rule.py
 create mode 100644 kernel_tuner/generation/token/__init__.py
 create mode 100644 kernel_tuner/generation/token/code_token.py
 create mode 100644 kernel_tuner/generation/token/pragma_token.py
 create mode 100644 kernel_tuner/generation/token/token.py
 create mode 100644 kernel_tuner/generation/tree/__init__.py
 create mode 100644 kernel_tuner/generation/tree/tree.py
 create mode 100644 kernel_tuner/generation/utils/util.py
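The heart of this second commit is a rule engine: it parses the generated wrapper into a tree of pragma tokens and proposes rewritten variants, each with its own tune parameters. For example, AddStaticScheduleRule splits a combined construct so that the loop schedule becomes tunable; written out by hand (the transformation matches the before/after visible in the temp_n6ijqzaa dump below, which spells the parameter "scedule_type"):

// Before: one combined construct; only num_threads is tunable.
#pragma omp target parallel for num_threads(nthreads)
for (int i = 0; i < VECTOR_SIZE; i++) {
    c[i] = a[i] + b[i];
}

// After: the rule splits the construct and exposes the loop schedule
// as an additional tune parameter.
#pragma omp target parallel num_threads(nthreads)
#pragma omp for schedule(static)
for (int i = 0; i < VECTOR_SIZE; i++) {
    c[i] = a[i] + b[i];
}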
diff --git a/examples/auto-generation/add_schedule_example.py b/examples/auto-generation/add_schedule_example.py
new file mode 100644
index 000000000..810b20010
--- /dev/null
+++ b/examples/auto-generation/add_schedule_example.py
@@ -0,0 +1,39 @@
+from kernel_tuner import tune_kernel
+from kernel_tuner.interface import auto_tune_kernel
+from kernel_tuner.utils.directives import (
+    DirectiveCode,
+    OpenMP,
+    Cxx,
+)
+
+code = """
+#include <stdio.h>
+#include <stdlib.h>
+
+#define VECTOR_SIZE 10000
+
+void vector_add(float *a, float *b, float *c) {
+    #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
+    #pragma omp target parallel for num_threads(nthreads)
+    for ( int i = 0; i < VECTOR_SIZE; i++ ) {
+        c[i] = a[i] + b[i];
+    }
+    #pragma tuner stop
+}
+"""
+
+# Extract tunable directive
+directive = DirectiveCode(OpenMP(), Cxx())
+
+tune_params = dict()
+tune_params["nthreads"] = [4, 8]
+
+auto_tune_kernel(
+    "vector_add",
+    code,
+    0,
+    tune_params=tune_params,
+    compiler_options=["-fopenmp", "-mp=gpu"],
+    compiler="nvc++",
+    directive=directive
+)
diff --git a/examples/auto-generation/new.py b/examples/auto-generation/new.py
new file mode 100644
index 000000000..de403cfc9
--- /dev/null
+++ b/examples/auto-generation/new.py
@@ -0,0 +1,39 @@
+from kernel_tuner import tune_kernel
+from kernel_tuner.interface import auto_tune_kernel
+from kernel_tuner.utils.directives import (
+    DirectiveCode,
+    OpenMP,
+    Cxx,
+)
+
+code = """
+#include <stdio.h>
+#include <stdlib.h>
+
+#define VECTOR_SIZE 100000000
+
+void vector_add(float *a, float *b, float *c) {
+    #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
+    #pragma omp target parallel for num_threads(nthreads)
+    for ( int i = 0; i < VECTOR_SIZE; i++ ) {
+        c[i] = a[i] + b[i];
+    }
+    #pragma tuner stop
+}
+"""
+
+# Extract tunable directive
+directive = DirectiveCode(OpenMP(), Cxx())
+
+tune_params = dict()
+tune_params["nthreads"] = [16, 32]
+
+auto_tune_kernel(
+    "vector_add",
+    code,
+    0,
+    tune_params=tune_params,
+    compiler_options=["-fopenmp", "-mp=gpu"],
+    compiler="nvc++",
+    directive=directive
+)
diff --git a/examples/auto-generation/schedule_chunk_example.py b/examples/auto-generation/schedule_chunk_example.py
new file mode 100644
index 000000000..4b5c2bd88
--- /dev/null
+++ 
b/examples/auto-generation/schedule_chunk_example.py @@ -0,0 +1,40 @@ +from kernel_tuner import tune_kernel +from kernel_tuner.interface import auto_tune_kernel +from kernel_tuner.utils.directives import ( + DirectiveCode, + OpenMP, + Cxx, +) + +code = """ +#include +#include + +#define VECTOR_SIZE 10000 + +void vector_add(float *a, float *b, float *c) { + #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) + #pragma omp target parallel num_threads(nthreads) + #pragma omp for schedule(static) + for ( int i = 0; i < VECTOR_SIZE; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop +} +""" + +# Extract tunable directive +directive = DirectiveCode(OpenMP(), Cxx()) + +tune_params = dict() +tune_params["nthreads"] = [4, 8] + +auto_tune_kernel( + "vector_add", + code, + 0, + tune_params=tune_params, + compiler_options=["-fopenmp", "-mp=gpu"], + compiler="nvc++", + directive=directive +) diff --git a/examples/auto-generation/temp_n6ijqzaa b/examples/auto-generation/temp_n6ijqzaa new file mode 100644 index 000000000..412251d43 --- /dev/null +++ b/examples/auto-generation/temp_n6ijqzaa @@ -0,0 +1,193 @@ +==========CODE========== +#include +#include +#define VECTOR_SIZE 10000 + +#include +extern "C" float vector_add(float * restrict a, float * restrict b, float * restrict c, int size){ +#pragma omp target enter data map(to: a[0:10000]) +#pragma omp target enter data map(to: b[0:10000]) +#pragma omp target enter data map(to: c[0:10000]) +auto kt_timing_start = std::chrono::steady_clock::now(); +#pragma omp target parallel for num_threads(nthreads) +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +c[i] = a[i] + b[i]; +} + +auto kt_timing_end = std::chrono::steady_clock::now(); +std::chrono::duration elapsed_time = kt_timing_end - kt_timing_start; +#pragma omp target exit data map(from: a[0:10000]) +#pragma omp target exit data map(from: b[0:10000]) +#pragma omp target exit data map(from: c[0:10000]) + +return elapsed_time.count(); + +} + +==========TREE========== + +id: 55 +type: TOKEN_TYPE.PRAGMA +level: 0 +pragma_type: PRAGMA_TOKEN_TYPE.ROOT +keywords: [] +META: {} +line_start: +children: [62, 95, 42, 98, 6, 100, 38] +content: + + +id: 62 +type: TOKEN_TYPE.PRAGMA +level: 1 +pragma_type: PRAGMA_TOKEN_TYPE.DATA_ENTER +keywords: ['DATA', 'MAP'] +META: {} +line_start: #pragma omp target enter data map(to: a[0:10000]) +children: [] +content: + enter data map(to: a[0:10000]) + +id: 95 +type: TOKEN_TYPE.PRAGMA +level: 1 +pragma_type: PRAGMA_TOKEN_TYPE.DATA_ENTER +keywords: ['DATA', 'MAP'] +META: {} +line_start: #pragma omp target enter data map(to: b[0:10000]) +children: [] +content: + enter data map(to: b[0:10000]) + +id: 42 +type: TOKEN_TYPE.PRAGMA +level: 1 +pragma_type: PRAGMA_TOKEN_TYPE.DATA_ENTER +keywords: ['DATA', 'MAP'] +META: {} +line_start: #pragma omp target enter data map(to: c[0:10000]) +children: [] +content: + enter data map(to: c[0:10000]) + +id: 98 +type: TOKEN_TYPE.PRAGMA +level: 1 +pragma_type: PRAGMA_TOKEN_TYPE.PARALLEL +keywords: ['PARALLEL', 'FOR', 'NUM_THREADS'] +META: {: 'nthreads'} +line_start: #pragma omp target parallel for num_threads(nthreads) +children: [28] +content: + #pragma omp target parallel for num_threads(nthreads) +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +c[i] = a[i] + b[i]; +} + +id: 28 +type: TOKEN_TYPE.FOR +line_start: for ( int i = 0; i < VECTOR_SIZE; i++ ) { +children: [] +content: + for ( int i = 0; i < VECTOR_SIZE; i++ ) { +c[i] = a[i] + b[i]; +} + +id: 6 +type: TOKEN_TYPE.PRAGMA +level: 1 
+pragma_type: PRAGMA_TOKEN_TYPE.DATA_EXIT +keywords: ['DATA', 'MAP'] +META: {} +line_start: #pragma omp target exit data map(from: a[0:10000]) +children: [] +content: + exit data map(from: a[0:10000]) + +id: 100 +type: TOKEN_TYPE.PRAGMA +level: 1 +pragma_type: PRAGMA_TOKEN_TYPE.DATA_EXIT +keywords: ['DATA', 'MAP'] +META: {} +line_start: #pragma omp target exit data map(from: b[0:10000]) +children: [] +content: + exit data map(from: b[0:10000]) + +id: 38 +type: TOKEN_TYPE.PRAGMA +level: 1 +pragma_type: PRAGMA_TOKEN_TYPE.DATA_EXIT +keywords: ['DATA', 'MAP'] +META: {} +line_start: #pragma omp target exit data map(from: c[0:10000]) +children: [] +content: + exit data map(from: c[0:10000]) + +==========RULES========== + +==========LOOP========== + +==========CODE========== +#include +#include +#define VECTOR_SIZE 10000 + +#include +extern "C" float vector_add(float * restrict a, float * restrict b, float * restrict c, int size){ +#pragma omp target enter data map(to: a[0:10000]) +#pragma omp target enter data map(to: b[0:10000]) +#pragma omp target enter data map(to: c[0:10000]) +auto kt_timing_start = std::chrono::steady_clock::now(); +#pragma omp target parallel for num_threads(nthreads) +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +c[i] = a[i] + b[i]; +} + +auto kt_timing_end = std::chrono::steady_clock::now(); +std::chrono::duration elapsed_time = kt_timing_end - kt_timing_start; +#pragma omp target exit data map(from: a[0:10000]) +#pragma omp target exit data map(from: b[0:10000]) +#pragma omp target exit data map(from: c[0:10000]) + +return elapsed_time.count(); + +} + + +'{'nthreads': [4, 8]} + +'[{'nthreads': 4, 'time': 0.08165500100169863, 'times': [0.1260250061750412, 0.07672300189733505, 0.07780499756336212, 0.07726400345563889, 0.07372699677944183, 0.07005099952220917, 0.06999000161886215], 'compile_time': 1799.757830798626, 'verification_time': 0, 'benchmark_time': 0.9146742522716522, 'strategy_time': 0, 'framework_time': 0, 'timestamp': '2024-07-04 21:50:02.637776+00:00'}, {'nthreads': 8, 'time': 0.14048899869833673, 'times': [0.572746992111206, 0.06971000134944916, 0.064860999584198, 0.060513000935316086, 0.06010200083255768, 0.06553199887275696, 0.08995799720287323], 'compile_time': 1737.5205904245377, 'verification_time': 0, 'benchmark_time': 1.3068746775388718, 'strategy_time': 0, 'framework_time': 0.36424025893211365, 'timestamp': '2024-07-04 21:50:04.377010+00:00'}] +==========CODE========== +#include +#include +#define VECTOR_SIZE 10000 + +#include +extern "C" float vector_add(float * restrict a, float * restrict b, float * restrict c, int size){ +#pragma omp target enter data map(to: a[0:10000]) +#pragma omp target enter data map(to: b[0:10000]) +#pragma omp target enter data map(to: c[0:10000]) +auto kt_timing_start = std::chrono::steady_clock::now(); +#pragma omp target parallel num_threads (nthreads) +#pragma omp for schedule (scedule_type) +for ( int i = 0; i < VECTOR_SIZE; i++ ) { +c[i] = a[i] + b[i]; +} + +auto kt_timing_end = std::chrono::steady_clock::now(); +std::chrono::duration elapsed_time = kt_timing_end - kt_timing_start; +#pragma omp target exit data map(from: a[0:10000]) +#pragma omp target exit data map(from: b[0:10000]) +#pragma omp target exit data map(from: c[0:10000]) + +return elapsed_time.count(); + +} + + +'{'nthreads': [4, 8], 'scedule_type': ['static']} + +'[{'nthreads': 4, 'scedule_type': 'static', 'time': 0.08587557183844703, 'times': [0.14716500043869019, 0.06352800130844116, 0.0857589989900589, 0.07673300057649612, 0.08159200102090836, 
0.06972000002861023, 0.07663200050592422], 'compile_time': 1957.1005813777447, 'verification_time': 0, 'benchmark_time': 0.9234901517629623, 'strategy_time': 0, 'framework_time': 0, 'timestamp': '2024-07-04 21:50:08.375552+00:00'}, {'nthreads': 8, 'scedule_type': 'static', 'time': 0.10097399832946914, 'times': [0.3191649913787842, 0.04559500142931938, 0.08122099936008453, 0.06004200130701065, 0.07753399759531021, 0.05485299974679947, 0.06840799748897552], 'compile_time': 1985.8440440148115, 'verification_time': 0, 'benchmark_time': 1.0134782642126083, 'strategy_time': 0, 'framework_time': 0.32549723982810974, 'timestamp': '2024-07-04 21:50:10.362772+00:00'}] diff --git a/examples/directives/vector_add_c_openacc.py b/examples/directives/vector_add_c_openacc.py index 9892c5c2e..ebbfb4acb 100644 --- a/examples/directives/vector_add_c_openacc.py +++ b/examples/directives/vector_add_c_openacc.py @@ -2,16 +2,12 @@ """This is a simple example for tuning C++ OpenACC code with the kernel tuner""" from kernel_tuner import tune_kernel +from kernel_tuner.interface import auto_tune_kernel from kernel_tuner.utils.directives import ( DirectiveCode, OpenACC, Cxx, - extract_directive_signature, - extract_directive_code, - extract_preprocessor, - generate_directive_function, - extract_directive_data, - allocate_signature_memory, + preprocess_directive_source, ) code = """ @@ -40,35 +36,25 @@ """ # Extract tunable directive -app = DirectiveCode(OpenACC(), Cxx()) -preprocessor = extract_preprocessor(code) -signature = extract_directive_signature(code, app) -body = extract_directive_code(code, app) -# Allocate memory on the host -data = extract_directive_data(code, app) -args = allocate_signature_memory(data["vector_add"], preprocessor) -# Generate kernel string -kernel_string = generate_directive_function( - preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"] -) +directive = DirectiveCode(OpenACC(), Cxx()) +kernel_source, args = preprocess_directive_source("vector_add", code, directive) +print(args) tune_params = dict() -tune_params["nthreads"] = [32 * i for i in range(1, 33)] +tune_params["nthreads"] = [32 * i for i in range(1, 3)] metrics = dict() metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9 answer = [None, None, args[0] + args[1], None] -print(kernel_string) - -# tune_kernel( -# "vector_add", -# kernel_string, -# 0, -# args, -# tune_params, -# metrics=metrics, -# answer=answer, -# compiler_options=["-fast", "-acc=gpu"], -# compiler="nvc++", -# ) +auto_tune_kernel( + "vector_add", + kernel_source, + 0, + tune_params, + arguments=args, + metrics=metrics, + answer=answer, + compiler_options=["-fast", "-acc=gpu"], + compiler="nvc++", +) diff --git a/examples/directives/vector_add_c_openmp.py b/examples/directives/vector_add_c_openmp.py index c1fa5473c..867aa5ad8 100644 --- a/examples/directives/vector_add_c_openmp.py +++ b/examples/directives/vector_add_c_openmp.py @@ -2,16 +2,11 @@ """This is a simple example for tuning C++ OpenACC code with the kernel tuner""" from kernel_tuner import tune_kernel +from kernel_tuner.interface import auto_tune_kernel from kernel_tuner.utils.directives import ( DirectiveCode, OpenMP, Cxx, - extract_directive_signature, - extract_directive_code, - extract_preprocessor, - generate_directive_function, - extract_directive_data, - allocate_signature_memory, ) code = """ @@ -22,8 +17,9 @@ void vector_add(float *a, float *b, float *c) { #pragma tuner start vector_add a(float*:VECTOR_SIZE) 
b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
+    #pragma omp teams num_teams(nteams)
     #pragma omp parallel for num_threads(nthreads)
-    for ( int i = 0; i < size; i++ ) {
+    for ( int i = 0; i < VECTOR_SIZE; i++ ) {
         c[i] = a[i] + b[i];
     }
     #pragma tuner stop
@@ -31,39 +27,18 @@
 """
 
 # Extract tunable directive
-app = DirectiveCode(OpenMP(), Cxx())
-preprocessor = extract_preprocessor(code)
-signature = extract_directive_signature(code, app)
-body = extract_directive_code(code, app)
-# Allocate memory on the host
-data = extract_directive_data(code, app)
-args = allocate_signature_memory(data["vector_add"], preprocessor)
-# Generate kernel string
-kernel_string = generate_directive_function(
-    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
-)
+directive = DirectiveCode(OpenMP(), Cxx())
 
 tune_params = dict()
 tune_params["nthreads"] = [16, 32]
-metrics = dict()
-metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9
-
-# answer = [None, None, args[0] + args[1], None]
-
-# print(preprocessor)
-# print(signature)
-# print(body)
-# print(data)
-# print(args)
-print(kernel_string)
+tune_params["nteams"] = [2, 4]
 
-tune_kernel(
+auto_tune_kernel(
     "vector_add",
-    kernel_string,
+    code,
     0,
-    args,
-    tune_params,
-    metrics=metrics,
+    tune_params=tune_params,
     compiler_options=["-fopenmp", "-mp=gpu"],
     compiler="nvc++",
+    directive=directive
 )
diff --git a/kernel_tuner/generation/code/__init__.py b/kernel_tuner/generation/code/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kernel_tuner/generation/code/code.py b/kernel_tuner/generation/code/code.py
new file mode 100644
index 000000000..3f91b14fb
--- /dev/null
+++ b/kernel_tuner/generation/code/code.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+from kernel_tuner.generation.code.line import Line
+
+
+class Code:
+
+    def __init__(self, lines: list[str]) -> None:
+        self.initial_lines = list(map(lambda x: Line(x[1], x[0]), enumerate(lines.copy())))
+        self.lines = list(map(lambda x: Line(x[1], x[0]), enumerate(lines)))
+        self.num_lines = len(self.lines)
+
+    def print(self):
+        print(self.to_text())
+
+    def to_text(self) -> str:
+        return '\n'.join(list(map(lambda x: x.content, self.initial_lines)))
+
+
+class CodeBlock:
+
+    def __init__(self, lines: list[Line]):
+        self.lines = lines
+
+    def size(self) -> int:
+        return len(self.lines)
+
+    def get(self, i) -> Line | None:
+        if i < self.size():
+            return self.lines[i]
+        else:
+            return None
\ No newline at end of file
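For orientation before the rule implementations below: the construct that AddNumThreadsAndDistributeRule aims to produce combines teams, distribute, parallel for, num_teams, and num_threads into a single target construct. Written out by hand with the patch's tune-parameter names (a sketch, not output copied from the tool):

// Combined construct proposed by the rule: iterations are distributed
// across teams, with team count and threads per team both tunable.
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
for (int i = 0; i < VECTOR_SIZE; i++) {
    c[i] = a[i] + b[i];
}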
diff --git a/kernel_tuner/generation/code/context.py b/kernel_tuner/generation/code/context.py
new file mode 100644
index 000000000..183d7bba0
--- /dev/null
+++ b/kernel_tuner/generation/code/context.py
@@ -0,0 +1,121 @@
+import copy
+from kernel_tuner.generation.code.code import Code
+from kernel_tuner.generation.code.line import Line
+from kernel_tuner.generation.token.pragma_token import PragmaToken, PRAGMA_KEYWORDS
+from kernel_tuner.generation.utils.util import PragmaTuneParams
+
+
+class Context:
+
+    def __init__(self, initial_code: Code, initial_tune_params: PragmaTuneParams):
+        self.initial_code = initial_code
+        self.tune_param_names = set(map(lambda x: x[1], initial_tune_params))
+        self.propositions: dict[str, tuple[Code, PragmaTuneParams]] = {}
+
+    # def offer_with_new_line(self, old_line: Line, new_line: Line, rule_id: str):
+    #     proposition_code = copy.deepcopy(self.initial_code)
+    #     for idx, line in enumerate(proposition_code.initial_lines):
+    #         if line.line_number == old_line.line_number:
+    #             proposition_code.initial_lines[idx] = new_line
+    #             break
+    #     self.propositions.append(proposition_code)
+
+    def __append(
+        self,
+        rule_id: str,
+        proposition_code: Code,
+        pragma_tune_params: PragmaTuneParams
+    ):
+        self.propositions[rule_id] = (proposition_code, pragma_tune_params)
+        # update(), not add(): add() would try to insert the (unhashable) set itself
+        self.tune_param_names.update(map(lambda x: x[1], pragma_tune_params))
+
+    def offer_with_new_lines(
+        self,
+        old_lines: list[Line],
+        new_lines: list[Line],
+        rule_id: str,
+        pragma_tune_params: PragmaTuneParams
+    ):
+        if len(old_lines) != len(new_lines):
+            return
+        proposition_code = copy.deepcopy(self.initial_code)
+        for idx, line in enumerate(proposition_code.initial_lines):
+            for old_idx, old in enumerate(old_lines):
+                if line.line_number == old.line_number:
+                    proposition_code.initial_lines[idx] = new_lines[old_idx]
+                    continue
+        self.__append(rule_id, proposition_code, pragma_tune_params)
+
+    def offer_with_new_token(
+        self,
+        old_tokens: list[PragmaToken],
+        new_tokens: list[PragmaToken],
+        rule_id: str,
+        pragma_tune_params: PragmaTuneParams
+    ):
+        self.offer_with_new_lines(
+            list(map(lambda x: x.initial_line, old_tokens)),
+            list(map(lambda x: x.initial_line, new_tokens)),
+            rule_id,
+            pragma_tune_params
+        )
+
+    def offer_with_new_token_content(
+        self,
+        old_token: PragmaToken,
+        new_token: PragmaToken,
+        rule_id: str,
+        pragma_tune_params: PragmaTuneParams
+    ):
+        if rule_id in self.propositions:
+            proposition_code = self.propositions[rule_id][0]
+        else:
+            proposition_code = copy.deepcopy(self.initial_code)
+        for idx, line in enumerate(proposition_code.initial_lines):
+            if line.line_number == old_token.initial_line.line_number:
+                proposition_code.initial_lines = proposition_code.initial_lines[:idx] + new_token.content.lines + proposition_code.initial_lines[idx + len(old_token.content.lines):]
+                break
+        self.__append(rule_id, proposition_code, pragma_tune_params)
+
+    def offer_with_add_pragma_above(
+        self,
+        old_token: PragmaToken,
+        new_token: PragmaToken,
+        rule_id: str,
+        pragma_tune_params: PragmaTuneParams
+    ):
+        if rule_id in self.propositions:
+            proposition_code = self.propositions[rule_id][0]
+        else:
+            proposition_code = copy.deepcopy(self.initial_code)
+        for idx, line in enumerate(proposition_code.initial_lines):
+            if line.line_number == old_token.initial_line.line_number:
+                proposition_code.initial_lines.insert(idx, new_token.initial_line)
+                break
+        self.__append(rule_id, proposition_code, pragma_tune_params)
+
+    def offer_with_line_token(self, old_line: Line, new_token: PragmaToken, rule_id: str, pragma_tune_params: PragmaTuneParams):
+        proposition_code = copy.deepcopy(self.initial_code)
+        for idx, line in enumerate(proposition_code.initial_lines):
+            if line.line_number == old_line.line_number:
+                proposition_code.initial_lines = proposition_code.initial_lines[:idx-1] + new_token.content.lines + proposition_code.initial_lines[idx:]
+                break
+        # propositions is a dict keyed by rule id, so store through __append
+        self.__append(rule_id, proposition_code, pragma_tune_params)
+
+    # TODO make it more beautiful
+    def get_tune_param_unique_name(self, prefix: str) -> str:
+        while prefix in self.tune_param_names:
+            prefix += '_A'
+        return prefix
+
+    def get(self, rule_id) -> tuple[Code, PragmaTuneParams]:
+        if rule_id in self.propositions:
+            return self.propositions[rule_id]
+        return None
+
+    def print_propositions(self):
+        # propositions maps rule ids to (Code, params) tuples
+        for code, _ in self.propositions.values():
+            code.print()
\ No newline at end of file
diff --git a/kernel_tuner/generation/code/line.py b/kernel_tuner/generation/code/line.py
new file mode 100644
index 000000000..c4cb33771
--- /dev/null
+++ b/kernel_tuner/generation/code/line.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+
+class Line:
+    def __init__(self, line, line_number):
+        self.content = line.strip()
+        self.line_number = line_number
+        self.words = self.content.split()
+
+    def replace(self, remove: str, append: str = ''):
+        self.content = self.content.replace(remove, append).strip()
+
+    def is_open_brace(self) -> bool:
+        return self.content.startswith('{') or self.content.endswith('{')
+
+    def is_close_brace(self) -> bool:
+        return self.content.startswith('}') or self.content.endswith('}')
+
+    def startswith(self, start) -> bool:
+        return self.content.startswith(start)
+
+    def endswith(self, end) -> bool:
+        return self.content.endswith(end)
+
+    def append(self, line: Line, split: str):
+        self.content += split + line.content
+        self.content = self.content.strip()
+
+    def find(self, finder):
+        # Returns None when absent; a match at index 0 is falsy, so callers
+        # should compare against None explicitly.
+        if finder in self.content:
+            return self.content.find(finder)
+        else:
+            return None
+
+    def len(self):
+        return len(self.content)
+
+    def find_end_line(self):
+        return self.find(';')
+
+    def __add__(self, line) -> Line:
+        self.content += ' ' + line.content
+        self.content = self.content.strip()
+        return self
+
+
+def merge_to_one_line(lines: list[Line]) -> str:
+    return ''.join(list(map(lambda x: x.content, lines)))
diff --git a/kernel_tuner/generation/generation.py b/kernel_tuner/generation/generation.py
new file mode 100644
index 000000000..67f65cd99
--- /dev/null
+++ b/kernel_tuner/generation/generation.py
@@ -0,0 +1,45 @@
+from kernel_tuner.generation.code.code import Code
+from kernel_tuner.generation.code.context import Context
+from kernel_tuner.generation.tree.tree import TreeBuilder, Tree
+from kernel_tuner.generation.rules.add_num_threads_and_distribute_rule import AddNumThreadsAndDistributeRule
+from kernel_tuner.generation.rules.add_chunk_size_to_schedule_rule import AddChunkSizeToScheduleRule
+from kernel_tuner.generation.rules.add_schedule_rule import AddStaticScheduleRule
+from kernel_tuner.generation.token.pragma_token import PragmaToken, PRAGMA_KEYWORDS
+from kernel_tuner.util import write_file
+from kernel_tuner.generation.utils.util import PragmaTuneParams, convertPragmaTuneToDict
+
+
+def generate_kernel_sources(initial_code_str: str, initial_tune_params: dict, debug_file=None):
+    print(initial_tune_params)
+    code = Code(initial_code_str.split('\n'))
+    if debug_file:
+        write_file(debug_file, '='*10 + 'CODE' + '='*10 + '\n' + code.to_text() + '\n\n', "a")
+    tree_builder = TreeBuilder(code)
+    tree = tree_builder.build_tree()
+    if debug_file:
+        write_file(debug_file, '='*10 + 'TREE' + '='*10 + '\n\n', "a")
+        tree.dfs_print(debug_file_name=debug_file)
+
+    result_tune_param = []
+    tree.dfs(convert_pragma_keyword_to_initial_tune_params, initial_tune_params, result_tune_param)
+    context = Context(code, result_tune_param)
+    if debug_file:
+        write_file(debug_file, '='*10 + 'RULES' + '='*10 + '\n\n', "a")
+
+    # s_rule = AddNumThreadsAndDistributeRule(tree, context, result_tune_param)
+    # s_rule = AddChunkSizeToScheduleRule(tree, context, result_tune_param)
+    s_rule = AddStaticScheduleRule(tree, context, result_tune_param)
+    s_rule.run(debug_file)
+    # context.get() returns None when the rule produced no proposition
+    proposal = context.get(s_rule.rule_id)
+    result = [(code, result_tune_param)] + ([proposal] if proposal else [])
+    return post_process(result)
+
+
+def convert_pragma_keyword_to_initial_tune_params(
+    node: PragmaToken,
+    initial_tune_params: dict[str, str],
+    result_tune_param: PragmaTuneParams
+):
+    for (pragma_keyword, param_name) in node.meta.items():
+        if param_name in initial_tune_params:
+            result_tune_param.append((pragma_keyword, param_name, 
initial_tune_params[param_name])) + +def post_process(result: list[tuple[Code, PragmaTuneParams]]): + return list(map(lambda x: (x[0], convertPragmaTuneToDict(x[1])), result)) \ No newline at end of file diff --git a/kernel_tuner/generation/rules/__init__.py b/kernel_tuner/generation/rules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kernel_tuner/generation/rules/add_chunk_size_to_schedule_rule.py b/kernel_tuner/generation/rules/add_chunk_size_to_schedule_rule.py new file mode 100644 index 000000000..50e8c07ee --- /dev/null +++ b/kernel_tuner/generation/rules/add_chunk_size_to_schedule_rule.py @@ -0,0 +1,39 @@ +from kernel_tuner.generation.rules.rule import RuleABC +from kernel_tuner.generation.tree.tree import Tree +from kernel_tuner.generation.code.context import Context +from kernel_tuner.generation.token.pragma_token import PRAGMA_TOKEN_TYPE, PRAGMA_KEYWORDS, build_pragma_token +from kernel_tuner.generation.code.line import Line +from kernel_tuner.util import write_file +from kernel_tuner.generation.utils.util import * + +# Only static schedule kind is supported for GPU +class AddChunkSizeToScheduleRule(RuleABC): + + def __init__(self, tree: Tree, context: Context, initila_params: dict): + super().__init__(tree, context, initila_params) + + def run(self, debug_file=None): + pragma_for = filter_pragmas_contains_keyword(self.tree.pragma_tokens, [PRAGMA_KEYWORDS.SCHEDULE]) + new_params = self.generate_param() + for pragma_for_child in pragma_for: + if PRAGMA_KEYWORDS.SCHEDULE in pragma_for_child.meta: + schedule = pragma_for_child.meta[PRAGMA_KEYWORDS.SCHEDULE].strip() + if schedule == 'static': + pragma_for_child.meta[PRAGMA_KEYWORDS.SCHEDULE] = 'static, chunk_size' + pragma_for_child.modify_keywords([], pragma_for_child.meta) + self.context.offer_with_new_token([pragma_for_child], [pragma_for_child], self.rule_id, new_params) + + + def generate_param(self) -> PragmaTuneParams: + param_name = self.context.get_tune_param_unique_name('chunk_size') + return self.initial_params + [ + (PRAGMA_KEYWORDS.SCHEDULE, param_name, ['16', '32']) + ] + +""" + +#pragma omp target parallel for num_threads(nthreads) + +#pragma omp target parallel num_threads(16) +#pragma omp for schedule(dynamic) +""" \ No newline at end of file diff --git a/kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py b/kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py new file mode 100644 index 000000000..cee3d6e70 --- /dev/null +++ b/kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py @@ -0,0 +1,48 @@ +from kernel_tuner.generation.rules.rule import RuleABC +from kernel_tuner.generation.tree.tree import Tree +from kernel_tuner.generation.code.context import Context +from kernel_tuner.generation.token.pragma_token import PRAGMA_TOKEN_TYPE, PRAGMA_KEYWORDS, build_pragma_token +from kernel_tuner.generation.code.line import Line +from kernel_tuner.util import write_file +from kernel_tuner.generation.utils.util import * + +class AddNumThreadsAndDistributeRule(RuleABC): + + def __init__(self, tree: Tree, context: Context, initila_params: dict): + super().__init__(tree, context, initila_params) + + def run(self, debug_file=None): + + parallel_pragmas = filter_pragmas_by_type(self.tree.pragma_tokens, PRAGMA_TOKEN_TYPE.PARALLEL) + parallel_for_pragmas = filter_pragmas_contains_keyword(parallel_pragmas, PRAGMA_KEYWORDS.FOR) + + old_tokens = [] + new_tokens = [] + new_tune_params = self.generate_param() + new_meta = dict(map(lambda x: (x[0], x[1]), 
diff --git a/kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py b/kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py
new file mode 100644
index 000000000..cee3d6e70
--- /dev/null
+++ b/kernel_tuner/generation/rules/add_num_threads_and_distribute_rule.py
@@ -0,0 +1,48 @@
+from kernel_tuner.generation.rules.rule import RuleABC
+from kernel_tuner.generation.tree.tree import Tree
+from kernel_tuner.generation.code.context import Context
+from kernel_tuner.generation.token.pragma_token import PRAGMA_TOKEN_TYPE, PRAGMA_KEYWORDS, build_pragma_token
+from kernel_tuner.generation.code.line import Line
+from kernel_tuner.util import write_file
+from kernel_tuner.generation.utils.util import *
+
+class AddNumThreadsAndDistributeRule(RuleABC):
+
+    def __init__(self, tree: Tree, context: Context, initial_params: PragmaTuneParams):
+        super().__init__(tree, context, initial_params)
+
+    def run(self, debug_file=None):
+        parallel_pragmas = filter_pragmas_by_type(self.tree.pragma_tokens, PRAGMA_TOKEN_TYPE.PARALLEL)
+        parallel_for_pragmas = filter_pragmas_contains_keyword(parallel_pragmas, [PRAGMA_KEYWORDS.FOR])
+
+        old_tokens = []
+        new_tokens = []
+        new_tune_params = self.generate_param()
+        new_meta = dict(map(lambda x: (x[0], x[1]), new_tune_params))
+        for parallel_for_pragma in parallel_for_pragmas:
+            new_node = build_pragma_token(
+                PRAGMA_TOKEN_TYPE.TEAMS,
+                [PRAGMA_KEYWORDS.NUM_TEAMS, PRAGMA_KEYWORDS.DISTRIBUTE, PRAGMA_KEYWORDS.PARALLEL, PRAGMA_KEYWORDS.FOR, PRAGMA_KEYWORDS.NUM_THREADS],
+                parallel_for_pragma.line.line_number,
+                meta=new_meta,
+                is_target_used=True,
+            )
+            old_tokens.append(parallel_for_pragma)
+            new_tokens.append(new_node)
+        self.context.offer_with_new_token(old_tokens, new_tokens, self.rule_id, new_tune_params)
+
+    def generate_param(self) -> PragmaTuneParams:
+        param_name = self.context.get_tune_param_unique_name('nteams')
+        return self.initial_params + [
+            (PRAGMA_KEYWORDS.NUM_TEAMS, param_name, ['1', '2'])
+        ]
+
+"""
+#pragma omp target parallel num_threads(nthreads)
+#pragma omp for schedule(static)
+
+
+#pragma omp target parallel num_threads(nthreads)
+#pragma omp for schedule(static, chunk_size)
+"""
\ No newline at end of file
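The rewrite this rule performs, sketched for one pragma (assuming nthreads is already a known tunable, so it appears in the rebuilt meta):

    # Effect of AddNumThreadsAndDistributeRule (illustrative):
    #   before: #pragma omp target parallel for num_threads(nthreads)
    #   after:  #pragma omp target teams num_teams(nteams) distribute parallel for num_threads(nthreads)
    # with nteams added as a new tunable parameter (values ['1', '2']).
    rule = AddNumThreadsAndDistributeRule(tree, context, initial_params)
    rule.run()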
diff --git a/kernel_tuner/generation/rules/add_schedule_rule.py b/kernel_tuner/generation/rules/add_schedule_rule.py
new file mode 100644
index 000000000..a06a32f6c
--- /dev/null
+++ b/kernel_tuner/generation/rules/add_schedule_rule.py
@@ -0,0 +1,56 @@
+from kernel_tuner.generation.rules.rule import RuleABC
+from kernel_tuner.generation.tree.tree import Tree
+from kernel_tuner.generation.code.context import Context
+from kernel_tuner.generation.token.pragma_token import PRAGMA_TOKEN_TYPE, PRAGMA_KEYWORDS, build_pragma_token
+from kernel_tuner.generation.code.line import Line
+from kernel_tuner.util import write_file
+from kernel_tuner.generation.utils.util import *
+
+# Only the static schedule kind is supported on GPUs
+class AddStaticScheduleRule(RuleABC):
+
+    def __init__(self, tree: Tree, context: Context, initial_params: PragmaTuneParams):
+        super().__init__(tree, context, initial_params)
+
+    def run(self, debug_file=None):
+        new_params = self.generate_param()
+        pragma_for = filter_pragmas_contains_keyword(self.tree.pragma_tokens, [PRAGMA_KEYWORDS.FOR], [PRAGMA_KEYWORDS.SCHEDULE])
+        for pragma_for_child in pragma_for:
+            # the upper node becomes a pragma without 'for'
+            pragma_for_child.modify_keywords([], replace_keywords=[PRAGMA_KEYWORDS.FOR])
+
+            new_node = build_pragma_token(
+                PRAGMA_TOKEN_TYPE.FOR,
+                [PRAGMA_KEYWORDS.SCHEDULE],
+                pragma_for_child.initial_line.line_number,
+                {PRAGMA_KEYWORDS.SCHEDULE: 'schedule_type'},
+                is_target_used=False
+            )
+
+            # copy the list: the children are removed while iterating
+            for ch in list(pragma_for_child.children):
+                new_node.append_child(ch)
+                pragma_for_child.remove_child(ch)
+            pragma_for_child.append_child(new_node)
+
+            self.context.offer_with_new_token(
+                [pragma_for_child], [new_node], self.rule_id, new_params
+            )
+
+            self.context.offer_with_add_pragma_above(
+                new_node, pragma_for_child, self.rule_id, new_params
+            )
+
+    def generate_param(self) -> PragmaTuneParams:
+        param_name = self.context.get_tune_param_unique_name('schedule_type')
+        return self.initial_params + [
+            (PRAGMA_KEYWORDS.SCHEDULE, param_name, ['static'])
+        ]
+
+"""
+
+#pragma omp target parallel for num_threads(nthreads)
+
+#pragma omp target parallel num_threads(16)
+#pragma omp for schedule(dynamic)
+"""
\ No newline at end of file
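The split this rule aims for, sketched for one pragma (tree, context, and initial params assumed from the pipeline):

    # Effect of AddStaticScheduleRule (illustrative): the combined pragma is split
    # so the loop gets its own schedulable 'for' pragma.
    #   before: #pragma omp target parallel for num_threads(nthreads)
    #   after:  #pragma omp target parallel num_threads(nthreads)
    #           #pragma omp for schedule(schedule_type)
    # with schedule_type added as a tunable parameter (values ['static']).
    rule = AddStaticScheduleRule(tree, context, initial_params)
    rule.run()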
diff --git a/kernel_tuner/generation/rules/rule.py b/kernel_tuner/generation/rules/rule.py
new file mode 100644
index 000000000..dc7a11b7d
--- /dev/null
+++ b/kernel_tuner/generation/rules/rule.py
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from kernel_tuner.generation.tree.tree import Tree
+from kernel_tuner.generation.code.context import Context
+from kernel_tuner.generation.utils.util import PragmaTuneParams
+import random
+
+
+class RuleABC(ABC):
+
+    def __init__(self, tree: Tree, context: Context, initial_params: PragmaTuneParams):
+        self.tree = tree
+        self.context = context
+        self.initial_params = initial_params
+        self.rule_id = random.randint(1, 100)  # note: ids drawn from a small range may collide
+
+    @abstractmethod
+    def run(self, debug_file=None):
+        pass
+
+    @abstractmethod
+    def generate_param(self):
+        pass
\ No newline at end of file
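New transformations plug in by subclassing RuleABC; a hypothetical skeleton (the rule name and its body are illustrative only):

    # Hypothetical skeleton of a new rule built on RuleABC.
    class AddCollapseRule(RuleABC):

        def __init__(self, tree, context, initial_params):
            super().__init__(tree, context, initial_params)

        def run(self, debug_file=None):
            # Locate candidate pragmas via self.tree, rewrite them, then offer
            # the variant to self.context under self.rule_id.
            pass

        def generate_param(self):
            # Extend self.initial_params with the rule's new tunable parameter.
            return self.initial_params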
+ """ + node_content = [] + line = start_line + node_content.append(line) + idx = 1 + next_line = initial_code.get(idx) + if not next_line: + print("IF NONE ONE LINE TODO") + return None + node_content.append(next_line) + idx+=1 + else_line = initial_code.get(idx) + if else_line and else_line.startswith('else'): + node_content.append(else_line) + idx+=1 + else_next_line = initial_code.get(idx) + if not else_next_line: + print("ELSE ONT LINE NONE TODO!") + return None + node_content.append(else_next_line) + currentToken = CodeToken(start_line, CodeBlock(node_content), TOKEN_TYPE.IF) + return currentToken + + @staticmethod + def build_variable_declaration(start_line: Line, initial_code: CodeBlock) -> CodeToken|None: + pattern = r'^\b(?:int|float|double|char|long|short)\s+[a-zA-Z_][a-zA-Z0-9_]*\s*;' + match = re.search(pattern, start_line.content) + if match: + return CodeToken(start_line, CodeBlock([start_line]), TOKEN_TYPE.VARIABLE_DECLARATION) + return None + + @staticmethod + def build_variable_assignment(start_line: Line, initial_code: CodeBlock) -> CodeToken|None: + pattern = r'^\b(?:int|float|double|char|long|short)\s+[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*[^;]*;' + match = re.search(pattern, start_line.content) + if match: + return CodeToken(start_line, CodeBlock([start_line]), TOKEN_TYPE.VARIABLE_ASSIGNMENT) + return None + + @staticmethod + def build_variable_reassignment(start_line: Line, initial_code: CodeBlock) -> CodeToken|None: + pattern = r'^(?!\s*(?:int|float|double|char|long|short)\s)\b[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*[^;]*;' + match = re.search(pattern, start_line.content) + if match: + return CodeToken(start_line, CodeBlock([start_line]), TOKEN_TYPE.VARIABLE_REASSIGNMENT) + return None + + @staticmethod + def generate_new_code_token( + content: list[Line], + type: TOKEN_TYPE + ) -> CodeToken: + return CodeToken(content[0], CodeBlock(content), type) + + def print(self, debug=False) -> str: + result = f"id: {self.id}\n" + result += f"type: {self.type}\n" + result += f"line_start: {self.line.content}\n" + result += f"children: {list(map(lambda x: x.id, self.children))}\n" + + if debug: + result += f"content: \n {self.print_content()}\n" + return result \ No newline at end of file diff --git a/kernel_tuner/generation/token/pragma_token.py b/kernel_tuner/generation/token/pragma_token.py new file mode 100644 index 000000000..b0c5331da --- /dev/null +++ b/kernel_tuner/generation/token/pragma_token.py @@ -0,0 +1,152 @@ +from __future__ import annotations +import copy +from kernel_tuner.generation.token.token import * +from kernel_tuner.generation.code.line import Line +from kernel_tuner.generation.code.code import CodeBlock, Code +import numpy as np +import re + + +class PragmaToken(Token): + + def __init__(self, line: Line, level) -> None: + self.initial_line = copy.deepcopy(line) + super().__init__(line, CodeBlock([line]), TOKEN_TYPE.PRAGMA) + self.content = CodeBlock([line]) + self.line.replace('#pragma omp') + if self.line.startswith('target'): + self.is_target_used = True + self.line.replace('target') + else: + self.is_target_used = False + + self.pragma_type = self.__detect__pragma_type() + (self.keywords, self.meta) = self.__detect_keywords() + self.pragma_children = [] + + if self.pragma_type.is_data(): + self.level = 1 + else: + self.level = level + + def append_child(self, child): + super().append_child(child) + if type(child) is not PragmaToken: + new_content = [self.initial_line] + child.content.lines + self.content = CodeBlock(new_content) + else: + 
diff --git a/kernel_tuner/generation/token/pragma_token.py b/kernel_tuner/generation/token/pragma_token.py
new file mode 100644
index 000000000..b0c5331da
--- /dev/null
+++ b/kernel_tuner/generation/token/pragma_token.py
@@ -0,0 +1,150 @@
+from __future__ import annotations
+import copy
+from kernel_tuner.generation.token.token import *
+from kernel_tuner.generation.code.line import Line
+from kernel_tuner.generation.code.code import CodeBlock, Code
+import re
+
+
+class PragmaToken(Token):
+
+    def __init__(self, line: Line, level) -> None:
+        self.initial_line = copy.deepcopy(line)
+        super().__init__(line, CodeBlock([line]), TOKEN_TYPE.PRAGMA)
+        self.line.replace('#pragma omp')
+        if self.line.startswith('target'):
+            self.is_target_used = True
+            self.line.replace('target')
+        else:
+            self.is_target_used = False
+
+        self.pragma_type = self.__detect_pragma_type()
+        (self.keywords, self.meta) = self.__detect_keywords()
+        self.pragma_children = []
+
+        if self.pragma_type.is_data():
+            self.level = 1
+        else:
+            self.level = level
+
+    def append_child(self, child):
+        super().append_child(child)
+        if type(child) is not PragmaToken:
+            new_content = [self.initial_line] + child.content.lines
+            self.content = CodeBlock(new_content)
+        else:
+            self.pragma_children.append(child)
+
+    def remove_child(self, child):
+        super().remove_child(child)
+
+    def modify_keywords(
+        self,
+        new_keywords: list[PRAGMA_KEYWORDS],
+        meta: dict[PRAGMA_KEYWORDS, str] = {},
+        replace_keywords: list[PRAGMA_KEYWORDS] = []
+    ):
+        new_kw = list(filter(lambda x: x not in replace_keywords, self.keywords))
+        new_kw += [x for x in new_keywords if x not in new_kw]
+        self.keywords = new_kw
+        self.meta.update(meta)
+        self.__rebuild()
+
+    def find_first_pragma(self, type: PRAGMA_TOKEN_TYPE) -> PragmaToken | None:
+        queue: list[PragmaToken] = [self]
+        results = []
+        self.__bfs_pragma(type, queue, results)
+        if len(results) > 0:
+            return results.pop(0)
+        return None
+
+    def find_all_pragma(self, type: PRAGMA_TOKEN_TYPE) -> list[PragmaToken]:
+        queue: list[PragmaToken] = [self]
+        results = []
+        self.__bfs_pragma(type, queue, results)
+        return results
+
+    def __bfs_pragma(self, type: PRAGMA_TOKEN_TYPE, queue: list[PragmaToken], results: list[PragmaToken]):
+        if len(queue) == 0:
+            return None
+        cur_tkn = queue.pop(0)
+        for ch in cur_tkn.pragma_children:
+            if ch.pragma_type == type:
+                results.append(ch)
+            queue.append(ch)
+        self.__bfs_pragma(type, queue, results)
+
+    def __detect_pragma_type(self) -> PRAGMA_TOKEN_TYPE:
+        if self.line.startswith('enter data'):
+            return PRAGMA_TOKEN_TYPE.DATA_ENTER
+        elif self.line.startswith('exit data'):
+            return PRAGMA_TOKEN_TYPE.DATA_EXIT
+        for ptk in PRAGMA_TOKEN_TYPE:
+            if self.line.startswith(ptk.name.lower()):
+                return ptk
+        return PRAGMA_TOKEN_TYPE.UNKNOWN
+
+    def __detect_keywords(self) -> tuple[list[PRAGMA_KEYWORDS], dict[PRAGMA_KEYWORDS, str]]:
+        keywords_result = []
+        keywords_map = {}
+        pattern_with_parentheses = re.compile(r'({})\(.*\)'.format('|'.join(PRAGMA_KEYWORDS_VALUES)))
+        pattern_exact = re.compile(r'({})'.format('|'.join(PRAGMA_KEYWORDS_VALUES)))
+        line = self.line
+        for word in line.content.split():
+            is_pattern_with_parentheses = pattern_with_parentheses.match(word)
+            is_pattern_exact = pattern_exact.match(word)
+            if is_pattern_with_parentheses:
+                for key_word in PRAGMA_KEYWORDS_VALUES:
+                    if re.match(r'{}\(.*\)'.format(key_word), word):
+                        if '(' in word:
+                            param = re.split(r'[()]', word)
+                            keywords_map[PRAGMA_KEYWORDS(param[0].strip())] = param[1].strip()
+                        keywords_result.append(PRAGMA_KEYWORDS(key_word))
+            elif is_pattern_exact:
+                keywords_result.append(PRAGMA_KEYWORDS(is_pattern_exact.group(0)))
+
+        return (keywords_result, keywords_map)
+
+    def __rebuild(self):
+        kws = ''
+        for kw in self.keywords:
+            kws += f"{kw.name.lower()} "
+            if kw in self.meta:
+                kws += f"({self.meta[kw]}) "
+        target = " target" if self.is_target_used else ""
+        self.initial_line = Line(f"#pragma omp{target} {kws}", self.initial_line.line_number)
+
+    def print(self, debug=False) -> str:
+        result = f"id: {self.id}\n"
+        result += f"type: {self.type}\n"
+        result += f"level: {self.level}\n"
+        result += f"pragma_type: {self.pragma_type}\n"
+        result += f"keywords: {list(map(lambda x: x.name, self.keywords))}\n"
+        result += f"META: {self.meta}\n"
+        result += f"line_start: {self.initial_line.content}\n"
+        result += f"children: {list(map(lambda x: x.id, self.children))}\n"
+        if debug:
+            result += f"content: \n {self.print_content()}\n"
+        return result
+
+
+def build_pragma_token(
+    type: PRAGMA_TOKEN_TYPE,
+    keywords: list[PRAGMA_KEYWORDS],
+    line_number: int,
+    meta: dict[PRAGMA_KEYWORDS, str] = {},
+    is_target_used: bool = True,
+    level: int | None = None
+) -> PragmaToken:
+
+    target = "target" if is_target_used else ""
+
+    pragma_type_str = f"{type.name.lower()}"
+
+    for keyword in keywords:
+        pragma_type_str += f" {keyword.name.lower()}"
+        if keyword in meta:
+            pragma_type_str += f" ({meta[keyword]})"
+
+    line = f"#pragma omp {target} {pragma_type_str}"
+    return PragmaToken(Line(line, line_number), level if level is not None else 0)
\ No newline at end of file
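Rendering a pragma token back to source, directly from the builder above (note that the builder emits a space before each clause's parentheses):

    # Building and rendering a PARALLEL pragma token (illustrative).
    token = build_pragma_token(
        PRAGMA_TOKEN_TYPE.PARALLEL,
        [PRAGMA_KEYWORDS.FOR, PRAGMA_KEYWORDS.NUM_THREADS],
        line_number=0,
        meta={PRAGMA_KEYWORDS.NUM_THREADS: 'nthreads'},
    )
    print(token.initial_line.content)
    # -> #pragma omp target parallel for num_threads (nthreads)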
f"{type.name.lower()}" + + for keyword in keywords: + pragma_type_str += f" {keyword.name.lower()}" + if keyword in meta: + pragma_type_str += f" ({meta[keyword]})" + + line = f"#pragma omp {target} {pragma_type_str}" + return PragmaToken(Line(line, line_number), level if level else 0) \ No newline at end of file diff --git a/kernel_tuner/generation/token/token.py b/kernel_tuner/generation/token/token.py new file mode 100644 index 000000000..c54a75022 --- /dev/null +++ b/kernel_tuner/generation/token/token.py @@ -0,0 +1,149 @@ +from __future__ import annotations +from enum import Enum +from abc import ABC, abstractmethod +from kernel_tuner.generation.code.line import Line +from kernel_tuner.generation.code.code import Code, CodeBlock +import random + +class PRAGMA_TOKEN_TYPE(Enum): + ROOT = 1 + DATA_ENTER = 2 + DATA_EXIT = 3 + TEAMS = 4 + PARALLEL = 5 + DISTRIBUTE = 6 + SECTIONS = 7 + SINGLE = 8 + SIMD = 9 + DECLARE = 10 + TASK = 11 + TASKLOOP = 12 + TASKYIELD = 13 + UPDATE = 14 + MASTER = 15 + CRITICAL = 16 + BARRIER = 17 + TASKWAIT = 18 + TASKGROUP = 19 + ATOMIC = 20 + FLUSH = 21 + ORDERED = 22 + CANCEL = 23 + THREADPRIVATE = 24 + FOR = 25 + UNKNOWN = 26 + + + def is_data(self): + return True if self is PRAGMA_TOKEN_TYPE.DATA_ENTER or self is PRAGMA_TOKEN_TYPE.DATA_EXIT else False + + +class PRAGMA_KEYWORDS(Enum): + PARALLEL = 'parallel' + FOR = 'for' + SIMD = 'simd' + TASK = 'task' + TEAMS = 'teams' + DISTRIBUTE = 'distribute' + UNKNOWN = 'unknown' + NUM_THREADS = 'num_threads' + NUM_TEAMS = 'num_teams' + DEFAULT = 'default' + PRIVATE = 'private' + FIRST_PRIVATE = 'firstprivate' + LATS_PRIVATE = 'lastprivate' + COPY_PRIVATE = 'copyprivate' + LINEAR = 'linear' + SCHEDULE = 'schedule' + ORDERED = 'ordered' + NOWAIT = 'nowait' + SHARED = 'shared' + COPY_IN = 'copyin' + REDUCTION = 'reduction' + PROC_BIND = 'proc_bind' + SAFELEN = 'safelen' + SIMDLEN = 'simdlen' + ALIGNED = 'aligned' + UNIFORM = 'uniform' + IN_BRANCH = 'inbranch' + NOT_IN_BRANCH = 'notinbranch' + FINAL = 'final' + UNTIED = 'untied' + MERGEABLE = 'mergeable' + PRIORITY = 'priority' + GRAIN_SIZE = 'grainsize' + NUM_TASKS = 'num_tasks' + NO_GROUP = 'nogroup' + TARGET = 'target' + DATA = 'data' + DEVICE = 'device' + MAP = 'map' + DEPEND = 'depend' + IS_DEVICE_PTR = 'is_device_ptr' + DEFAULT_MAP = 'defaultmap' + DIST_SCHEDULE = 'dist_schedule' + + + +PRAGMA_KEYWORDS_VALUES = list(map(lambda x: x.value, PRAGMA_KEYWORDS)) + +class TOKEN_TYPE(Enum): + BLOCK = 1 + FOR = 2 + IF = 3 + IF_ELSE = 4 + FUNCTION_CALL = 5 + PRAGMA = 6 + VARIABLE_ASSIGNMENT = 7 + VARIABLE_DECLARATION = 8 + VARIABLE_REASSIGNMENT = 9 + COMPLEX_STRUCTURE = 10 + + +class Token(ABC): + + def __init__(self, line: Line, content: CodeBlock, type: TOKEN_TYPE): + self.line = line + self.content = content + self.type = type + self.children: list[Token] = [] + self.parent: Token | None = None + self.id = random.randint(1, 100) + pass + + def append_child(self, child): + self.children.append(child) + child.parent = self + + def remove_child(self, child): + self.children.remove(child) + child.parent = None + + def print_content(self) -> str: + return '\n'.join(list(map(lambda x: x.content, self.content.lines))) + + def find_first(self, type: TOKEN_TYPE) -> Token|None: + queue:list[Token] = [self] + results = [] + self.__bfs(type, queue, results) + return results.pop(0) + + def find_all(self, type: TOKEN_TYPE) -> list[Token]: + queue:list[Token] = [self] + results = [] + self.__bfs(type, queue, results) + return results + + def __bfs(self, type: TOKEN_TYPE, queue: 
diff --git a/kernel_tuner/generation/tree/__init__.py b/kernel_tuner/generation/tree/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kernel_tuner/generation/tree/tree.py b/kernel_tuner/generation/tree/tree.py
new file mode 100644
index 000000000..49131f444
--- /dev/null
+++ b/kernel_tuner/generation/tree/tree.py
@@ -0,0 +1,126 @@
+from kernel_tuner.generation.token.pragma_token import PragmaToken
+from kernel_tuner.generation.token.token import *
+from kernel_tuner.generation.token.code_token import *
+from kernel_tuner.generation.code.line import Line
+from kernel_tuner.generation.code.code import Code, CodeBlock
+from kernel_tuner.util import write_file
+
+class Tree:
+
+    def __init__(self, root: PragmaToken, pragma_tokens: list[PragmaToken]) -> None:
+        self.root = root
+        self.root.pragma_type = PRAGMA_TOKEN_TYPE.ROOT
+        self.pragma_tokens = pragma_tokens
+
+    def append_node(self, parent: Token, child: Token, insert_token: Token):
+        if child not in parent.children:
+            print("ERROR: child does not belong to the given parent!")
+            return
+        parent.remove_child(child)
+        parent.append_child(insert_token)
+        insert_token.append_child(child)
+
+    def replace_node(self, replace_token: Token, new_token: Token):
+        parent = replace_token.parent
+        if not parent:
+            return
+        parent.remove_child(replace_token)
+        parent.append_child(new_token)
+
+    def dfs(self, func, *args):
+        self.__dfs_loop(self.root, func, *args)
+
+    def __dfs_loop(self, node, func, *args):
+        func(node, *args)
+        for child in node.pragma_children:
+            self.__dfs_loop(child, func, *args)
+
+    def dfs_print(self, node=None, debug_file_name=None):
+        cur_node = node if node else self.root
+        if debug_file_name:
+            write_file(debug_file_name, cur_node.print(True) + '\n', "a")
+        else:
+            print(cur_node.print(True))
+        for child in cur_node.children:
+            self.dfs_print(child, debug_file_name)
+
+
+class TreeBuilder:
+
+    def __init__(self, code: Code) -> None:
+        self.tree_root = PragmaToken(Line('', 0), 0)
+        self.node_map = {0: self.tree_root}
+        self.current_level = 1
+        self.current_bracket_level = 0
+        self.last_bracket_level = 0
+        self.code = code
+        self.pragma_tokens = []
+        self.idx = 0
+
+    def build_tree(self):
+        while self.idx < self.code.num_lines:
+            line = self.code.lines[self.idx]
+            if line.is_open_brace():
+                self.current_bracket_level += 1
+            if line.is_close_brace():
+                self.current_bracket_level -= 1
+
+            if line.startswith("#pragma omp"):
+                node = self.__build_pragma_token(line)
+
+                if node.level - 1 not in self.node_map:
+                    print("ERROR: no parent registered for this pragma level!")
+                    exit(1)
+
+                self.node_map[node.level - 1].append_child(node)
+                self.node_map[node.level] = node
+
+                self.pragma_tokens.append(node)
+
+            self.idx += 1
+
+        self.__build_code_token()
+        return Tree(self.tree_root, self.pragma_tokens)
+
+    def __build_pragma_token(self, line) -> PragmaToken:
+        if self.code.lines[self.idx - 1].startswith('#pragma omp') and not self.pragma_tokens[len(self.pragma_tokens) - 1].pragma_type.is_data():
+            self.current_level += 1
+        else:
+            if self.current_bracket_level > self.last_bracket_level:
+                self.current_level += 1
+                self.last_bracket_level = self.current_bracket_level
+            elif self.current_bracket_level < self.last_bracket_level:
+                self.current_level -= 1
+                self.last_bracket_level = self.current_bracket_level
+
+        # merge line continuations ('\') into a single pragma line
+        if line.endswith('\\'):
+            while True:
+                line.replace('\\')
+                self.idx += 1
+                next_line = self.code.lines[self.idx]
+                line += next_line
+                if not next_line.endswith('\\'):
+                    break
+
+        node = PragmaToken(line, self.current_bracket_level)
+        return node
+
+    def __build_code_token(self):
+        for node in self.pragma_tokens:
+            if node.pragma_type == PRAGMA_TOKEN_TYPE.ROOT:
+                continue
+            idx = node.line.line_number + 1
+            if idx >= self.code.num_lines:
+                return
+            next_line = self.code.initial_lines[idx]
+            if next_line.startswith('#pragma omp'):
+                continue
+            code_token = CodeToken.build_code_token(next_line, CodeBlock(self.code.initial_lines[idx:]))
+            if code_token:
+                node.append_child(code_token)
+
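Typical use of the builder, assuming kernel_source is an annotated source string and Code comes from kernel_tuner.generation.code:

    # Build the pragma tree and inspect the detected pragmas (illustrative).
    code = Code(kernel_source.split('\n'))
    tree = TreeBuilder(code).build_tree()
    tree.dfs_print()  # dump the pragma hierarchy to stdout
    for pragma in tree.pragma_tokens:
        print(pragma.pragma_type, pragma.keywords)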
diff --git a/kernel_tuner/generation/utils/util.py b/kernel_tuner/generation/utils/util.py
new file mode 100644
index 000000000..ddccab356
--- /dev/null
+++ b/kernel_tuner/generation/utils/util.py
@@ -0,0 +1,21 @@
+from kernel_tuner.generation.token.pragma_token import *
+from typing import TypeAlias
+
+PragmaTuneParams: TypeAlias = list[tuple[PRAGMA_KEYWORDS, str, list[str]]]
+
+def convertPragmaTuneToDict(pragmaTuneParams: PragmaTuneParams) -> dict[str, list[str]]:
+    return dict(map(lambda x: (x[1], x[2]), pragmaTuneParams))
+
+def filter_pragmas_by_type(pragmas: list[PragmaToken], type: PRAGMA_TOKEN_TYPE) -> list[PragmaToken]:
+    return list(filter(lambda x: x.pragma_type == type, pragmas))
+
+def filter_pragmas_contains_keyword(
+    pragmas: list[PragmaToken],
+    contains_keywords: list[PRAGMA_KEYWORDS],
+    exclude_keywords: list[PRAGMA_KEYWORDS] = []
+) -> list[PragmaToken]:
+    results = []
+    for pragma in pragmas:
+        if any(x in contains_keywords for x in pragma.keywords) and all(x not in exclude_keywords for x in pragma.keywords):
+            results.append(pragma)
+    return results
\ No newline at end of file
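How these helpers are combined, assuming a tree built as above:

    # Select pragmas that carry 'for' but no explicit schedule clause,
    # i.e. the candidates AddStaticScheduleRule operates on.
    candidates = filter_pragmas_contains_keyword(
        tree.pragma_tokens,
        [PRAGMA_KEYWORDS.FOR],
        exclude_keywords=[PRAGMA_KEYWORDS.SCHEDULE],
    )
    # Flatten pragma tune params into the dict form tune_kernel expects.
    tune_dict = convertPragmaTuneToDict(
        [(PRAGMA_KEYWORDS.NUM_THREADS, 'nthreads', ['32', '64'])]
    )  # -> {'nthreads': ['32', '64']}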
diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py
index 97ae22848..1e0bd764e 100644
--- a/kernel_tuner/interface.py
+++ b/kernel_tuner/interface.py
@@ -31,11 +31,14 @@
 import kernel_tuner.core as core
 import kernel_tuner.util as util
+import kernel_tuner.utils.directives as directives_util
 from kernel_tuner.integration import get_objective_defaults
 from kernel_tuner.runners.sequential import SequentialRunner
 from kernel_tuner.runners.simulation import SimulationRunner
 from kernel_tuner.searchspace import Searchspace
 
+from kernel_tuner.generation.generation import generate_kernel_sources
+
 try:
     import torch
 except ImportError:
@@ -546,8 +549,8 @@ def tune_kernel(
     kernel_name,
     kernel_source,
     problem_size,
-    arguments,
     tune_params,
+    arguments=None,
     grid_div_x=None,
     grid_div_y=None,
     grid_div_z=None,
@@ -564,6 +567,7 @@ def tune_kernel(
     texmem_args=None,
     compiler=None,
     compiler_options=None,
+    directive=None,
     defines=None,
     log=None,
     iterations=7,
@@ -582,6 +586,9 @@ def tune_kernel(
     if log:
         logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log)
 
+    if directive:
+        kernel_source, arguments = directives_util.preprocess_directive_source(kernel_name, kernel_source, directive)
+
     kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines)
 
     _check_user_input(kernel_name, kernelsource, arguments, block_size_names)
@@ -735,13 +742,97 @@ def tune_kernel(
         _device_options
     )
 
+def auto_tune_kernel(
+    kernel_name,
+    kernel_source,
+    problem_size,
+    tune_params,
+    arguments=None,
+    grid_div_x=None,
+    grid_div_y=None,
+    grid_div_z=None,
+    restrictions=None,
+    answer=None,
+    atol=1e-6,
+    verify=None,
+    verbose=False,
+    lang=None,
+    device=0,
+    platform=0,
+    smem_args=None,
+    cmem_args=None,
+    texmem_args=None,
+    compiler=None,
+    compiler_options=None,
+    directive=None,
+    defines=None,
+    log=None,
+    iterations=7,
+    block_size_names=None,
+    quiet=False,
+    strategy=None,
+    strategy_options=None,
+    cache=None,
+    metrics=None,
+    simulation_mode=False,
+    observers=None,
+    objective=None,
+    objective_higher_is_better=None,
+):
+    initial_kernel_source, arguments = directives_util.preprocess_directive_source(kernel_name, kernel_source, directive)
+
+    debug_file = util.get_temp_filename()
+
+    generated_sources = generate_kernel_sources(initial_kernel_source, tune_params, debug_file)
+
+    opts = locals()
+    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
+    device_options = Options([(k, opts[k]) for k in _device_options.keys()])
+
+    strategy = brute_force
+
+    util.write_file(debug_file, '='*10 + 'LOOP' + '='*10 + '\n\n', "a")
+    for (generated_code, generated_tune_params) in generated_sources:
+
+        util.write_file(debug_file, '='*10 + 'CODE' + '='*10 + f"\n{generated_code.to_text()}\n\n", "a")
+
+        kernelsource = core.KernelSource(kernel_name, generated_code.to_text(), lang, defines)
+        tune_params = generated_tune_params
+
+        opts = locals()
+        tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
+        tuning_options["unique_results"] = {}
+        if strategy_options and "max_fevals" in strategy_options:
+            tuning_options["max_fevals"] = strategy_options["max_fevals"]
+        if strategy_options and "time_limit" in strategy_options:
+            tuning_options["time_limit"] = strategy_options["time_limit"]
+        tuning_options.simulated_time = 0
+
+        util.write_file(debug_file, f"\n{generated_tune_params}\n", "a")
+
+        runner = SequentialRunner(kernelsource, kernel_options, device_options, iterations, observers)
+
+        tuning_options.verify = util.normalize_verify_function(tuning_options.verify)
+
+        searchspace = Searchspace(tune_params, restrictions, runner.dev.max_threads)
+        restrictions = searchspace._modified_restrictions
+        tuning_options.restrictions = restrictions
+
+        results = strategy.tune(searchspace, runner, tuning_options)
+        util.write_file(debug_file, f"\n{results}\n", "a")
+
+    print("Done!")
+
+
 def run_kernel(
     kernel_name,
     kernel_source,
     problem_size,
-    arguments,
     params,
+    arguments=None,
     grid_div_x=None,
     grid_div_y=None,
     grid_div_z=None,
@@ -753,6 +844,7 @@ def run_kernel(
     texmem_args=None,
     compiler=None,
     compiler_options=None,
+    directive=None,
     defines=None,
     block_size_names=None,
     quiet=False,
@@ -761,6 +853,9 @@ def run_kernel(
     if log:
         logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log)
 
+    if directive:
+        kernel_source, arguments = directives_util.preprocess_directive_source(kernel_name, kernel_source, directive)
+
     kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines)
 
     _check_user_input(kernel_name, kernelsource, arguments, block_size_names)
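A hypothetical driver for the new entry point; the import path assumes auto_tune_kernel is exported alongside tune_kernel, and kernel_string, size, and directive_code are placeholders for a directive-annotated source, a problem size, and the directive description object consumed by preprocess_directive_source:

    from kernel_tuner import auto_tune_kernel  # assumes the symbol is exported

    auto_tune_kernel(
        "vector_add",
        kernel_string,                 # C/C++ source annotated with directives
        size,                          # problem size
        {"nthreads": ["32", "64"]},    # initial tunable parameters
        compiler="nvc++",
        directive=directive_code,
    )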
diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index aeebd5116..4d6315e27 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -69,6 +69,7 @@ def run(self, parameter_space, tuning_options):
         # iterate over parameter space
         for element in parameter_space:
             params = dict(zip(tuning_options.tune_params.keys(), element))
+            print(f"Params: {params}")
 
             result = None
             warmup_time = 0
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 0d2cef696..c73cb86dd 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -788,14 +788,14 @@ def setup_block_and_grid(problem_size, grid_div, params, block_size_names=None):
     return threads, grid
 
 
-def write_file(filename, string):
+def write_file(filename, string, mode="w"):
     """Dump the contents of string to a file called filename."""
     # ugly fix, hopefully we can find a better one
     if sys.version_info[0] >= 3:
-        with open(filename, "w", encoding="utf-8") as f:
+        with open(filename, mode, encoding="utf-8") as f:
             f.write(string)
     else:
-        with open(filename, "w") as f:
+        with open(filename, mode) as f:
             f.write(string.encode("utf-8"))
 
diff --git a/kernel_tuner/utils/directives.py b/kernel_tuner/utils/directives.py
index 85b4181ea..e6b73af93 100644
--- a/kernel_tuner/utils/directives.py
+++ b/kernel_tuner/utils/directives.py
@@ -87,6 +87,19 @@ def add(self, dim: int) -> None:
         if dim >= 1:
             self.size.append(dim)
 
+def preprocess_directive_source(kernel_name, kernel_source, directive_code):
+    preprocessor = extract_preprocessor(kernel_source)
+    signature = extract_directive_signature(kernel_source, directive_code)
+    body = extract_directive_code(kernel_source, directive_code)
+    # Allocate memory on the host
+    data = extract_directive_data(kernel_source, directive_code)
+    args = allocate_signature_memory(data[kernel_name], preprocessor)
+    # Generate kernel string
+    kernel_string = generate_directive_function(
+        preprocessor, signature[kernel_name], body[kernel_name], directive_code, data=data[kernel_name]
+    )
+    return kernel_string, args
+
 def fortran_md_size(size: ArraySize) -> list:
     """Format a multidimensional size into the correct Fortran string"""
 
diff --git a/noxfile.py b/noxfile.py
index e32bbb588..7112bbc9f 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -15,7 +15,7 @@
 # set the test parameters
 verbose = False
-python_versions_to_test = ["3.9", "3.10", "3.11", "3.12"]
+python_versions_to_test = ["3.11"]
 nox.options.stop_on_first_error = True
 nox.options.error_on_missing_interpreters = True
 nox.options.default_venv_backend = 'virtualenv'