From 8c75178c1fc3db07652a235d83726dd9576d4565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5rten=20Bj=C3=B6rkman?= Date: Fri, 17 May 2019 00:39:51 +0200 Subject: [PATCH] MatchSiftData optimised for 1000+ features --- matching.cu | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/matching.cu b/matching.cu index e6730af..3dca6a0 100644 --- a/matching.cu +++ b/matching.cu @@ -292,8 +292,110 @@ __global__ void CleanMatches(SiftPoint *sift1, int numPts1) sift1[p1].score = 0.0f; } -__device__ volatile int lock = 0; +#define M7W 32 +#define M7H 32 +#define M7R 4 +#define NRX 2 +#define NDIM 128 + +__global__ void FindMaxCorr10(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2) +{ + __shared__ float4 buffer1[M7W*NDIM/4]; + __shared__ float4 buffer2[M7H*NDIM/4]; + int tx = threadIdx.x; + int ty = threadIdx.y; + int bp1 = M7W*blockIdx.x; + for (int j=ty;jmax_score[i]) { + sec_score[i] = max_score[i]; + max_score[i] = score[dy][i]; + index[i] = min(bp2 + M7R*iy + dy, numPts2-1); + } else if (score[dy][i]>sec_score[i]) + sec_score[i] = score[dy][i]; + } + } + } + __syncthreads(); + } + + float *scores1 = (float*)buffer1; + float *scores2 = &scores1[M7W*M7H/M7R]; + int *indices = (int*)&scores2[M7W*M7H/M7R]; + if (idxmax_score) { + sec_score = max(max_score, sec_score); + max_score = scores1[y*M7W + tx]; + index = indices[y*M7W + tx]; + } else if (scores1[y*M7W + tx]>sec_score) + sec_score = scores1[y*M7W + tx]; + } + sift1[bp1 + tx].score = max_score; + sift1[bp1 + tx].match = index; + sift1[bp1 + tx].match_xpos = sift2[index].xpos; + sift1[bp1 + tx].match_ypos = sift2[index].ypos; + sift1[bp1 + tx].ambiguity = sec_score / (max_score + 1e-6f); + } +} + #define FMC_GH 512 #define FMC_BW 32 #define FMC_BH 32 @@ -304,6 +406,7 @@ __device__ volatile int lock = 0; #define FMC_NH (FMC_BH/FMC_TH) // 8 #define FMC_NT (FMC_NW*FMC_NH) // 256 = 8 warps +__device__ volatile int lock = 0; __global__ void FindMaxCorr9(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2) { @@ -1064,7 +1167,7 @@ double MatchSiftData(SiftData &data1, SiftData &data2) dim3 blocksMax3(iDivUp(numPts1, 16), iDivUp(numPts2, 512)); dim3 threadsMax3(16, 16); CleanMatches<<>>(sift1, numPts1); - int mode = 9; + int mode = 10; if (mode==5)// K40c 5.0ms, 1080 Ti 1.2ms, 2080 Ti 0.83ms FindMaxCorr5<<>>(sift1, sift2, numPts1, numPts2); else if (mode==6) { // 2080 Ti 0.89ms @@ -1080,6 +1183,10 @@ double MatchSiftData(SiftData &data1, SiftData &data2) blocksMax3 = dim3(iDivUp(numPts1, FMC_BW), iDivUp(numPts2, FMC_GH)); threadsMax3 = dim3(FMC_NW, FMC_NH); FindMaxCorr9<<>>(sift1, sift2, numPts1, numPts2); + } else if (mode==10) { // 2080 Ti 0.24ms + blocksMax3 = dim3(iDivUp(numPts1, M7W)); + threadsMax3 = dim3(M7W, M7H/M7R); + FindMaxCorr10<<>>(sift1, sift2, numPts1, numPts2); } safeCall(cudaDeviceSynchronize()); checkMsg("FindMaxCorr5() execution failed\n");