diff --git a/cpp/scripts/run-clang-compile.py b/cpp/scripts/run-clang-compile.py
index 4edbde84b3..123f0e4075 100644
--- a/cpp/scripts/run-clang-compile.py
+++ b/cpp/scripts/run-clang-compile.py
@@ -20,7 +20,6 @@
 
 from __future__ import print_function
 import argparse
-import glob
 import json
 import multiprocessing as mp
 import os
@@ -29,6 +28,8 @@
 import subprocess
 
 
+CMAKE_COMPILER_REGEX = re.compile(
+    r"^\s*CMAKE_CXX_COMPILER:FILEPATH=(.+)\s*$", re.MULTILINE)
 CLANG_COMPILER = "clang++"
 GPU_ARCH_REGEX = re.compile(r"sm_(\d+)")
 SPACES = re.compile(r"\s+")
@@ -54,6 +55,10 @@ def parse_args():
         help="Regex used to select files for checking")
     argparser.add_argument(
         "-j", type=int, default=-1, help="Number of parallel jobs to launch.")
+    argparser.add_argument(
+        "-build_dir", type=str, default=None,
+        help="Directory from which compile commands should be called. "
+        "By default, directory of compile_commands.json file.")
     args = argparser.parse_args()
     if args.j <= 0:
         args.j = mp.cpu_count()
@@ -63,15 +68,39 @@ def parse_args():
     # recent enough to handle CUDA >= 11
     if not os.path.exists(args.cdb):
         raise Exception("Compilation database '%s' missing" % args.cdb)
+    if args.build_dir is None:
+        args.build_dir = os.path.dirname(args.cdb)
     return args
 
 
+def get_gcc_root(build_dir):
+    """Return the GCC install prefix handed to clang as its toolchain."""
+    # preferred source: the C++ compiler that CMake recorded in its cache
+    cache_file = os.path.join(build_dir, "CMakeCache.txt")
+    if os.path.isfile(cache_file):
+        with open(cache_file) as cache:
+            found = CMAKE_COMPILER_REGEX.search(cache.read())
+        if found:
+            # strip the compiler's file name and its parent directory
+            return os.path.dirname(os.path.dirname(found.group(1)))
+    # next: the conda env, but only when a build sysroot is set as well
+    env = os.environ
+    if env.get("CONDA_PREFIX", "") and env.get("CONDA_BUILD_SYSROOT", ""):
+        return env["CONDA_PREFIX"]
+    # last resort: whichever g++ is found first on PATH
+    gxx_path = shutil.which("g++")
+    if not gxx_path:
+        raise Exception("Cannot find any g++ install on the system.")
+    return os.path.dirname(os.path.dirname(gxx_path))
+
+
 def list_all_cmds(cdb):
     with open(cdb, "r") as fp:
         return json.load(fp)
 
 
 def get_gpu_archs(command):
+    # clang only accepts a single architecture, so first determine the lowest
     archs = []
     for loc in range(len(command)):
         if (command[loc] != "-gencode" and command[loc] != "--generate-code"
@@ -83,8 +112,8 @@ def get_gpu_archs(command):
             arch_flag = command[loc + 1]
         match = GPU_ARCH_REGEX.search(arch_flag)
         if match is not None:
-            archs.append("--cuda-gpu-arch=sm_%s" % match.group(1))
-    return archs
+            archs.append(int(match.group(1)))
+    return ["--cuda-gpu-arch=sm_%d" % min(archs)]
 
 
 def get_index(arr, item_options):
@@ -113,15 +142,10 @@ def add_cuda_path(command, nvcc):
     if not nvcc_path:
         raise Exception("Command %s has invalid compiler %s" % (command, nvcc))
     cuda_root = os.path.dirname(os.path.dirname(nvcc_path))
-    # make sure that cuda root has version.txt
-    if not os.path.isfile(os.path.join(cuda_root, "version.txt")):
-        raise Exception(
-            "clang++ expects a `version.txt` file in your CUDA root path with "
-            "content `CUDA Version <major>.<minor>.<build>`")
     command.append('--cuda-path=%s' % cuda_root)
 
 
-def get_clang_args(cmd):
+def get_clang_args(cmd, build_dir):
     command, file = cmd["command"], cmd["file"]
     is_cuda = file.endswith(".cu")
     command = re.split(SPACES, command)
@@ -208,15 +232,8 @@ def get_clang_args(cmd):
     for i, x in reversed(list(enumerate(command))):
         if x.startswith("-Werror"):
             del command[i]
-    # add GCC headers if we can find GCC
-    gcc_path = shutil.which("gcc")
-    if gcc_path:
-        gcc_base = os.path.dirname(os.path.dirname(gcc_path))
-        gcc_glob1 = os.path.join(gcc_base, "lib", "gcc", "*", "*", "include")
-        gcc_glob2 = os.path.join(gcc_base, "lib64", "gcc", "*", "*", "include")
-        inc_dirs = glob.glob(gcc_glob1) + glob.glob(gcc_glob2)
-        for d in inc_dirs:
-            command.extend(["-isystem", d])
+    # try to figure out which GCC CMAKE used, and tell clang all about it
+    command.append("--gcc-toolchain=%s" % get_gcc_root(build_dir))
     return command
 
 
@@ -257,11 +274,10 @@ def print_result(passed, stdout, file):
 
 
 def run_clang(cmd, args):
-    command = get_clang_args(cmd)
-    cwd = os.path.dirname(args.cdb)
+    command = get_clang_args(cmd, args.build_dir)
     # compile only and dump output to /dev/null
     command.extend(["-c", cmd["file"], "-o", os.devnull])
-    status, out = run_clang_command(command, cwd)
+    status, out = run_clang_command(command, args.build_dir)
     # we immediately print the result since this is more interactive for user
     with lock:
         print_result(status, out, cmd["file"])
diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py
index ed1a633232..49f96aa18b 100644
--- a/cpp/scripts/run-clang-tidy.py
+++ b/cpp/scripts/run-clang-tidy.py
@@ -19,199 +19,349 @@
 #
 
 from __future__ import print_function
-import sys
-import re
-import os
-import subprocess
 import argparse
 import json
 import multiprocessing as mp
+import os
+import re
+import shutil
+import subprocess
 
 
-EXPECTED_VERSION = "8.0.1"
-VERSION_REGEX = re.compile(r"  LLVM version ([0-9.]+)")
+EXPECTED_VERSIONS = ("11.1.0",)
+VERSION_REGEX = re.compile(r"clang version ([0-9.]+)")
+CMAKE_COMPILER_REGEX = re.compile(
+    r"^\s*CMAKE_CXX_COMPILER:FILEPATH=(.+)\s*$", re.MULTILINE)
+CLANG_COMPILER = "clang++"
 GPU_ARCH_REGEX = re.compile(r"sm_(\d+)")
 SPACES = re.compile(r"\s+")
-SEPARATOR = "-" * 16
+XCOMPILER_FLAG = re.compile(r"-((Xcompiler)|(-compiler-options))=?")
+XPTXAS_FLAG = re.compile(r"-((Xptxas)|(-ptxas-options))=?")
+# any options that may have equal signs in nvcc but not in clang
+# add those options here if you find any
+OPTIONS_NO_EQUAL_SIGN = ['-isystem']
+SEPARATOR = "-" * 8
+END_SEPARATOR = "*" * 64
 
 
 def parse_args():
     argparser = argparse.ArgumentParser("Runs clang-tidy on a project")
-    argparser.add_argument("-cdb", type=str, default="compile_commands.json",
-                           help="Path to cmake-generated compilation database")
-    argparser.add_argument("-exe", type=str, default="clang-tidy",
-                           help="Path to clang-tidy exe")
-    argparser.add_argument("-ignore", type=str, default="[.]cu$",
-                           help="Regex used to ignore files from checking")
-    argparser.add_argument("-select", type=str, default=None,
-                           help="Regex used to select files for checking")
-    argparser.add_argument("-j", type=int, default=-1,
-                           help="Number of parallel jobs to launch.")
+    argparser.add_argument(
+        "-cdb", type=str, default="compile_commands.json",
+        help="Path to cmake-generated compilation database")
+    argparser.add_argument(
+        "-exe", type=str, default="clang-tidy", help="Path to clang-tidy exe")
+    argparser.add_argument(
+        "-ignore", type=str, default=None,
+        help="Regex used to ignore files from checking")
+    argparser.add_argument(
+        "-select", type=str, default=None,
+        help="Regex used to select files for checking")
+    argparser.add_argument(
+        "-j", type=int, default=-1, help="Number of parallel jobs to launch.")
+    argparser.add_argument(
+        "-root", type=str, default=None,
+        help="Repo root path to filter headers correctly, CWD by default.")
+    argparser.add_argument(
+        "-thrust_dir", type=str, default=None,
+        help="Pass the directory to a THRUST git repo recent enough for clang.")
+    argparser.add_argument(
+        "-build_dir", type=str, default=None,
+        help="Directory from which compile commands should be called. "
+        "By default, directory of compile_commands.json file.")
     args = argparser.parse_args()
     if args.j <= 0:
         args.j = mp.cpu_count()
     args.ignore_compiled = re.compile(args.ignore) if args.ignore else None
     args.select_compiled = re.compile(args.select) if args.select else None
-    ret = subprocess.check_output("%s --version" % args.exe, shell=True)
+    # check clang's version (CI); search() since vendors prefix the banner
+    ret = subprocess.check_output("%s --version" % CLANG_COMPILER, shell=True)
     ret = ret.decode("utf-8")
     version = VERSION_REGEX.search(ret)
     if version is None:
-        raise Exception("Failed to figure out clang-tidy version!")
+        raise Exception("Failed to figure out clang compiler version!")
     version = version.group(1)
-    if version != EXPECTED_VERSION:
-        raise Exception("clang-tidy exe must be v%s found '%s'" % \
-                        (EXPECTED_VERSION, version))
+    if version not in EXPECTED_VERSIONS:
+        raise Exception("clang compiler version must be in %s found '%s'" %
+                        (EXPECTED_VERSIONS, version))
     if not os.path.exists(args.cdb):
         raise Exception("Compilation database '%s' missing" % args.cdb)
+    # we assume that this script is run from repo root
+    if args.root is None:
+        args.root = os.getcwd()
+    args.root = os.path.realpath(os.path.expanduser(args.root))
+    # we need to have a recent enough cub version for clang to compile
+    if args.thrust_dir is None:
+        args.thrust_dir = os.path.join(
+            os.path.dirname(args.cdb), "thrust_1.15", "src", "thrust_1.15")
+    if args.build_dir is None:
+        args.build_dir = os.path.dirname(args.cdb) or os.curdir
+    if not os.path.isdir(args.thrust_dir):
+        raise Exception("Cannot find custom thrust dir '%s'" % args.thrust_dir)
     return args
 
 
+def get_gcc_root(args):
+    """Return the GCC install prefix handed to clang as its toolchain."""
+    # preferred source: the C++ compiler that CMake recorded in its cache
+    cache_file = os.path.join(args.build_dir, "CMakeCache.txt")
+    if os.path.isfile(cache_file):
+        with open(cache_file) as cache:
+            found = CMAKE_COMPILER_REGEX.search(cache.read())
+        if found:
+            # strip the compiler's file name and its parent directory
+            return os.path.dirname(os.path.dirname(found.group(1)))
+    # next: the conda env, but only when a build sysroot is set as well
+    env = os.environ
+    if env.get("CONDA_PREFIX", "") and env.get("CONDA_BUILD_SYSROOT", ""):
+        return env["CONDA_PREFIX"]
+    # last resort: whichever g++ is found first on PATH
+    gxx_path = shutil.which("g++")
+    if not gxx_path:
+        raise Exception("Cannot find any g++ install on the system.")
+    return os.path.dirname(os.path.dirname(gxx_path))
+
+
 def list_all_cmds(cdb):
     with open(cdb, "r") as fp:
         return json.load(fp)
 
 
 def get_gpu_archs(command):
+    # clang gets one arch only: the lowest requested one (none -> no flag)
     archs = []
     for loc in range(len(command)):
-        if command[loc] != "-gencode":
+        if (command[loc] != "-gencode" and command[loc] != "--generate-code"
+                and not command[loc].startswith("--generate-code=")):
             continue
-        arch_flag = command[loc + 1]
+        if command[loc].startswith("--generate-code="):
+            arch_flag = command[loc][len("--generate-code="):]
+        else:
+            arch_flag = command[loc + 1]
         match = GPU_ARCH_REGEX.search(arch_flag)
         if match is not None:
-            archs.append("--cuda-gpu-arch=sm_%s" % match.group(1))
-    return archs
+            archs.append(int(match.group(1)))
+    return ["--cuda-gpu-arch=sm_%d" % min(archs)] if archs else []
 
 
-def get_index(arr, item):
-    try:
-        return arr.index(item)
-    except:
-        return -1
+def get_index(arr, item_options):
+    # positions in `arr` whose entry equals any of the `item_options`
+    return {pos for pos, tok in enumerate(arr) if tok in item_options}
 
 
-def remove_item(arr, item):
-    loc = get_index(arr, item)
-    if loc >= 0:
-        del arr[loc]
-    return loc
+def remove_items(arr, item_options):
+    hits = get_index(arr, item_options)  # every matching position goes
+    arr[:] = [tok for pos, tok in enumerate(arr) if pos not in hits]
 
 
-def remove_item_plus_one(arr, item):
-    loc = get_index(arr, item)
-    if loc >= 0:
-        del arr[loc + 1]
-        del arr[loc]
-    return loc
+def remove_items_plus_one(arr, item_options):
+    # removes, in place, every option in `item_options` plus its value
+    # "flag value" form: drop the flag and the token right after it
+    for pos in sorted(get_index(arr, item_options), reverse=True):
+        # slice deletion copes with a flag that is the very last token
+        del arr[pos:pos + 2]
+    # "flag=value" form: a single token, drop it on its own
+    prefixes = tuple(opt + "=" for opt in item_options)
+    arr[:] = [tok for tok in arr if not tok.startswith(prefixes)]
 
 
-def get_clang_includes(exe):
-    dir = os.getenv("CONDA_PREFIX")
-    if dir is None:
-        ret = subprocess.check_output("which %s 2>&1" % exe, shell=True)
-        ret = ret.decode("utf-8")
-        dir = os.path.dirname(os.path.dirname(ret))
-    header = os.path.join(dir, "include", "ClangHeaders")
-    return ["-I", header]
+def add_cuda_path(command, nvcc):
+    resolved = shutil.which(nvcc)
+    if not resolved:
+        raise Exception("Command %s has invalid compiler %s" % (command, nvcc))
+    bin_dir = os.path.dirname(resolved)  # cuda root is one level above this
+    command.append('--cuda-path=%s' % os.path.dirname(bin_dir))
 
 
-def get_tidy_args(cmd, exe):
+def get_tidy_args(cmd, args):
     command, file = cmd["command"], cmd["file"]
     is_cuda = file.endswith(".cu")
     command = re.split(SPACES, command)
+    # get original compiler
+    cc_orig = command[0]
     # compiler is always clang++!
     command[0] = "clang++"
     # remove compilation and output targets from the original command
-    remove_item_plus_one(command, "-c")
-    remove_item_plus_one(command, "-o")
+    remove_items_plus_one(command, ["--compile", "-c"])
+    remove_items_plus_one(command, ["--output-file", "-o"])
     if is_cuda:
+        # include our own cub before anything else
+        # (left-most should have highest priority)
+        command.insert(1, "-I%s" % args.thrust_dir)
         # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..."
         archs = get_gpu_archs(command)
         command.extend(archs)
-        while True:
-            loc = remove_item_plus_one(command, "-gencode")
-            if loc < 0:
-                break
+        # provide proper cuda path to clang
+        add_cuda_path(command, cc_orig)
+        # remove all kinds of nvcc flags clang doesn't know about
+        remove_items_plus_one(command, [
+            "--generate-code",
+            "-gencode",
+            "--x",
+            "-x",
+            "--compiler-bindir",
+            "-ccbin",
+            "--diag_suppress",
+            "-diag-suppress",
+            "--default-stream",
+            "-default-stream",
+        ])
+        remove_items(command, [
+            "-extended-lambda",
+            "--extended-lambda",
+            "-expt-extended-lambda",
+            "--expt-extended-lambda",
+            "-expt-relaxed-constexpr",
+            "--expt-relaxed-constexpr",
+            "--device-debug",
+            "-G",
+            "--generate-line-info",
+            "-lineinfo",
+        ])
         # "-x cuda" is the right usage in clang
-        loc = get_index(command, "-x")
-        if loc >= 0:
-            command[loc + 1] = "cuda"
-        remove_item_plus_one(command, "-ccbin")
-        remove_item(command, "--expt-extended-lambda")
-        remove_item(command, "--diag_suppress=unrecognized_gcc_pragma")
-    command.extend(get_clang_includes(exe))
+        command.extend(["-x", "cuda"])
+        # we remove -Xcompiler flags: here we basically have to hope for the
+        # best that clang++ will accept any flags which nvcc passed to gcc
+        for i, c in reversed(list(enumerate(command))):
+            new_c = XCOMPILER_FLAG.sub('', c)
+            if new_c == c:
+                continue
+            command[i:i + 1] = new_c.split(',')
+        # we also change -Xptxas to -Xcuda-ptxas, always adding space here
+        for i, c in reversed(list(enumerate(command))):
+            if XPTXAS_FLAG.search(c):
+                # "-Xptxas=v" carries its value; "-Xptxas v" keeps v next
+                value = XPTXAS_FLAG.sub('', c)
+                command[i:i + 1] = (
+                    ['-Xcuda-ptxas', value] if value else ['-Xcuda-ptxas'])
+        # several options like isystem don't expect `=`
+        for opt in OPTIONS_NO_EQUAL_SIGN:
+            opt_eq = opt + '='
+            # make sure that we iterate from back to front here for insert
+            for i, c in reversed(list(enumerate(command))):
+                if not c.startswith(opt_eq):
+                    continue
+                x = c.split('=')
+                # we only care about the first `=`
+                command[i] = x[0]
+                command.insert(i + 1, '='.join(x[1:]))
+        # use extensible whole program, to avoid ptx resolution/linking
+        command.extend(["-Xcuda-ptxas", "-ewp"])
+        # for libcudacxx, we need to allow variadic functions
+        command.extend(["-Xclang", "-fcuda-allow-variadic-functions"])
+        # add some additional CUDA intrinsics
+        cuda_intrinsics_file = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            "__clang_cuda_additional_intrinsics.h")
+        command.extend(["-include", cuda_intrinsics_file])
+    # somehow this option gets onto the commandline, it is unrecognized by tidy
+    remove_items(command, [
+        "--forward-unknown-to-host-compiler",
+        "-forward-unknown-to-host-compiler"
+    ])
+    # do not treat warnings as errors here !
+    for i, x in reversed(list(enumerate(command))):
+        if x.startswith("-Werror"):
+            del command[i]
+    # try to figure out which GCC CMAKE used, and tell clang all about it
+    command.append("--gcc-toolchain=%s" % get_gcc_root(args))
     return command, is_cuda
 
 
-def run_clang_tidy_command(tidy_cmd):
+def check_output_for_errors(output):
+    """Scan tool output; return (number of warning lines, error lines)."""
+    # plain substring tests: a line may count as both error and warning
+    all_lines = output.splitlines()
+    error_lines = [
+        text for text in all_lines if "error:" in text
+    ]
+    n_warnings = sum(
+        1 for text in all_lines if "warning:" in text)
+    return n_warnings, error_lines
+
+
+def run_clang_tidy_command(tidy_cmd, cwd):
     cmd = " ".join(tidy_cmd)
-    result = subprocess.run(cmd, check=False, shell=True,
+    result = subprocess.run(cmd, check=False, shell=True, cwd=cwd,
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    status = result.returncode == 0
-    if status:
-        out = ""
-    else:
-        out = "CMD: " + cmd
-    out += result.stdout.decode("utf-8").rstrip()
-    return status, out
+    text = result.stdout.decode("utf-8").strip()
+    # a run passes only if its output has neither warnings nor errors
+    n_warnings, errors = check_output_for_errors(text)
+    status = n_warnings == 0 and not errors
+    header = "CMD: " + cmd + "\n" + "EXIT-CODE: %d\n" % result.returncode
+    out = header + text
+    return status, out, errors
+
+
+class LockContext(object):  # `with` wrapper that tolerates lock=None
+    def __init__(self, lock=None) -> None:
+        self._lock = lock
+    
+    def __enter__(self):
+        if self._lock:
+            self._lock.acquire()
+        return self
+    
+    def __exit__(self, _, __, ___):
+        if self._lock:
+            self._lock.release()
+        return False  # falsy: exceptions from the with-body propagate
+
+
+def print_result(passed, stdout, file, errors):
+    # hard errors abort right away; warnings are printed and handed back
+    if any(errors):
+        details = (file, len(errors), stdout)
+        raise Exception("File %s: got %d errors:\n%s" % details)
+    verdict = "PASSED" if passed else "FAILED"
+    print("%s File:%s %s %s" % (SEPARATOR, file, verdict, SEPARATOR))
+    if passed or not stdout:
+        return []
+    print("%s\n%s\n" % (stdout, END_SEPARATOR))
+    return stdout.splitlines()
 
 
 def run_clang_tidy(cmd, args):
-    command, is_cuda = get_tidy_args(cmd, args.exe)
-    tidy_cmd = [args.exe, "-header-filter=.*raft/cpp/.*", cmd["file"], "--", ]
+    command, is_cuda = get_tidy_args(cmd, args)
+    header_path_any = os.path.join(os.path.basename(args.root), "cpp", ".*")
+    header_filter = "-header-filter='.*%s[.](cuh|h|hpp)$'" % header_path_any
+    tidy_cmd = [args.exe, header_filter, cmd["file"], "--"]
     tidy_cmd.extend(command)
     status = True
     out = ""
     if is_cuda:
         tidy_cmd.append("--cuda-device-only")
         tidy_cmd.append(cmd["file"])
-        ret, out1 = run_clang_tidy_command(tidy_cmd)
+        ret, out1, errors1 = run_clang_tidy_command(tidy_cmd, args.build_dir)
         out += out1
-        out += "%s" % SEPARATOR
-        if not ret:
-            status = ret
+        out += "\n%s\n" % SEPARATOR
+        status = status and ret
         tidy_cmd[-2] = "--cuda-host-only"
-        ret, out1 = run_clang_tidy_command(tidy_cmd)
-        if not ret:
-            status = ret
+        ret, out1, errors2 = run_clang_tidy_command(tidy_cmd, args.build_dir)
+        status = status and ret
         out += out1
+        errors = errors1 + errors2
     else:
         tidy_cmd.append(cmd["file"])
-        ret, out1 = run_clang_tidy_command(tidy_cmd)
-        if not ret:
-            status = ret
+        ret, out1, errors = run_clang_tidy_command(tidy_cmd, args.build_dir)
+        status = status and ret
         out += out1
-    return status, out, cmd["file"]
-
-
-# yikes! global var :(
-results = []
-def collect_result(result):
-    global results
-    results.append(result)
-
+    # print right away for interactivity, holding the global `lock` meanwhile
+    with lock:
+        lines = print_result(status, out, cmd["file"], errors)
+        return status, lines
 
-def print_result(passed, stdout, file):
-    status_str = "PASSED" if passed else "FAILED"
-    print("%s File:%s %s %s" % (SEPARATOR, file, status_str, SEPARATOR))
-    if stdout:
-        print(stdout)
-        print("%s File:%s ENDS %s" % (SEPARATOR, file, SEPARATOR))
 
-
-def print_results():
-    global results
-    status = True
-    for passed, stdout, file in results:
-        print_result(passed, stdout, file)
-        if not passed:
-            status = False
-    return status
+def parse_results(results):  # results: one (passed, lines) pair per file
+    return all(r[0] for r in results), [s for r in results for s in r[1]]
 
 
 # mostly used for debugging purposes
 def run_sequential(args, all_files):
-    status = True
+    # lock must be defined as in `run_parallel`
+    global lock
+    lock = LockContext()
+    results = []
     # actual tidy checker
     for cmd in all_files:
         # skip files that we don't want to look at
@@ -221,15 +371,22 @@ def run_sequential(args, all_files):
         if args.select_compiled is not None and \
            re.search(args.select_compiled, cmd["file"]) is None:
             continue
-        passed, stdout, file = run_clang_tidy(cmd, args)
-        print_result(passed, stdout, file)
-        if not passed:
-            status = False
-    return status
+        results.append(run_clang_tidy(cmd, args))
+    return parse_results(results)
+
+
+def copy_lock(init_lock):
+    # pool initializer: stores the given lock in the module-global `lock`
+    # see https://stackoverflow.com/questions/25557686/
+    # python-sharing-a-lock-between-processes
+    global lock
+    lock = init_lock
 
 
 def run_parallel(args, all_files):
-    pool = mp.Pool(args.j)
+    init_lock = LockContext(mp.Lock())
+    pool = mp.Pool(args.j, initializer=copy_lock, initargs=(init_lock,))
+    results = []
     # actual tidy checker
     for cmd in all_files:
         # skip files that we don't want to look at
@@ -239,11 +396,11 @@ def run_parallel(args, all_files):
         if args.select_compiled is not None and \
            re.search(args.select_compiled, cmd["file"]) is None:
             continue
-        pool.apply_async(run_clang_tidy, args=(cmd, args),
-                         callback=collect_result)
+        results.append(pool.apply_async(run_clang_tidy, args=(cmd, args)))
+    results_final = [r.get() for r in results]
     pool.close()
     pool.join()
-    return print_results()
+    return parse_results(results_final)
 
 
 def main():
@@ -252,11 +409,33 @@ def main():
     if not os.path.exists(".git"):
         raise Exception("This needs to always be run from the root of repo")
     all_files = list_all_cmds(args.cdb)
+    # ensure that we use only the real paths
+    for cmd in all_files:
+        cmd["file"] = os.path.realpath(os.path.expanduser(cmd["file"]))
     if args.j == 1:
-        status = run_sequential(args, all_files)
+        status, lines = run_sequential(args, all_files)
     else:
-        status = run_parallel(args, all_files)
+        status, lines = run_parallel(args, all_files)
     if not status:
+        # first get a list of all checks that were run
+        ret = subprocess.check_output(args.exe + " --list-checks", shell=True)
+        ret = ret.decode("utf-8")
+        checks = [line.strip() for line in ret.splitlines()
+                  if line.startswith(' ' * 4)]
+        max_check_len = max(len(c) for c in checks)
+        check_counts = dict()
+        content = os.linesep.join(lines)
+        for check in checks:
+            check_counts[check] = content.count(check)
+        sorted_counts = sorted(
+            check_counts.items(), key=lambda x: x[1], reverse=True)
+        print("Failed {} check(s) in total. Counts as per below:".format(
+            sum(1 for _, count in sorted_counts if count > 0)))
+        for check, count in sorted_counts:
+            if count <= 0:
+                break
+            n_space = max_check_len - len(check) + 4
+            print("{}:{}{}".format(check, ' ' * n_space, count))
         raise Exception("clang-tidy failed! Refer to the errors above.")