Skip to content

Commit

Permalink
MatchSiftData optimised for 1000+ features
Browse files Browse the repository at this point in the history
  • Loading branch information
Mårten Björkman committed May 16, 2019
1 parent f3a6fac commit 8c75178
Showing 1 changed file with 109 additions and 2 deletions.
111 changes: 109 additions & 2 deletions matching.cu
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,110 @@ __global__ void CleanMatches(SiftPoint *sift1, int numPts1)
sift1[p1].score = 0.0f;
}

__device__ volatile int lock = 0;
#define M7W 32
#define M7H 32
#define M7R 4
#define NRX 2
#define NDIM 128

__global__ void FindMaxCorr10(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)
{
__shared__ float4 buffer1[M7W*NDIM/4];
__shared__ float4 buffer2[M7H*NDIM/4];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bp1 = M7W*blockIdx.x;
for (int j=ty;j<M7W;j+=M7H/M7R) {
int p1 = min(bp1 + j, numPts1 - 1);
for (int d=tx;d<NDIM/4;d+=M7W)
buffer1[j*NDIM/4 + (d + j)%(NDIM/4)] = ((float4*)&sift1[p1].data)[d];
}

float max_score[NRX];
float sec_score[NRX];
int index[NRX];
for (int i=0;i<NRX;i++) {
max_score[i] = 0.0f;
sec_score[i] = 0.0f;
index[i] = -1;
}
int idx = ty*M7W + tx;
int ix = idx%(M7W/NRX);
int iy = idx/(M7W/NRX);
for (int bp2=0;bp2<numPts2 - M7H + 1;bp2+=M7H) {
for (int j=ty;j<M7H;j+=M7H/M7R) {
int p2 = min(bp2 + j, numPts2 - 1);
for (int d=tx;d<NDIM/4;d+=M7W)
buffer2[j*NDIM/4 + d] = ((float4*)&sift2[p2].data)[d];
}
__syncthreads();

if (idx<M7W*M7H/M7R/NRX) {
float score[M7R][NRX];
for (int dy=0;dy<M7R;dy++)
for (int i=0;i<NRX;i++)
score[dy][i] = 0.0f;
for (int d=0;d<NDIM/4;d++) {
float4 v1[NRX];
for (int i=0;i<NRX;i++)
v1[i] = buffer1[((M7W/NRX)*i + ix)*NDIM/4 + (d + (M7W/NRX)*i + ix)%(NDIM/4)];
for (int dy=0;dy<M7R;dy++) {
float4 v2 = buffer2[(M7R*iy + dy)*(NDIM/4) + d];
for (int i=0;i<NRX;i++) {
score[dy][i] += v1[i].x*v2.x;
score[dy][i] += v1[i].y*v2.y;
score[dy][i] += v1[i].z*v2.z;
score[dy][i] += v1[i].w*v2.w;
}
}
}
for (int dy=0;dy<M7R;dy++) {
for (int i=0;i<NRX;i++) {
if (score[dy][i]>max_score[i]) {
sec_score[i] = max_score[i];
max_score[i] = score[dy][i];
index[i] = min(bp2 + M7R*iy + dy, numPts2-1);
} else if (score[dy][i]>sec_score[i])
sec_score[i] = score[dy][i];
}
}
}
__syncthreads();
}

float *scores1 = (float*)buffer1;
float *scores2 = &scores1[M7W*M7H/M7R];
int *indices = (int*)&scores2[M7W*M7H/M7R];
if (idx<M7W*M7H/M7R/NRX) {
for (int i=0;i<NRX;i++) {
scores1[iy*M7W + (M7W/NRX)*i + ix] = max_score[i];
scores2[iy*M7W + (M7W/NRX)*i + ix] = sec_score[i];
indices[iy*M7W + (M7W/NRX)*i + ix] = index[i];
}
}
__syncthreads();

if (ty==0) {
float max_score = scores1[tx];
float sec_score = scores2[tx];
int index = indices[tx];
for (int y=0;y<M7H/M7R;y++)
if (index != indices[y*M7W + tx]) {
if (scores1[y*M7W + tx]>max_score) {
sec_score = max(max_score, sec_score);
max_score = scores1[y*M7W + tx];
index = indices[y*M7W + tx];
} else if (scores1[y*M7W + tx]>sec_score)
sec_score = scores1[y*M7W + tx];
}
sift1[bp1 + tx].score = max_score;
sift1[bp1 + tx].match = index;
sift1[bp1 + tx].match_xpos = sift2[index].xpos;
sift1[bp1 + tx].match_ypos = sift2[index].ypos;
sift1[bp1 + tx].ambiguity = sec_score / (max_score + 1e-6f);
}
}

#define FMC_GH 512
#define FMC_BW 32
#define FMC_BH 32
Expand All @@ -304,6 +406,7 @@ __device__ volatile int lock = 0;
#define FMC_NH (FMC_BH/FMC_TH) // 8
#define FMC_NT (FMC_NW*FMC_NH) // 256 = 8 warps

__device__ volatile int lock = 0;

__global__ void FindMaxCorr9(SiftPoint *sift1, SiftPoint *sift2, int numPts1, int numPts2)
{
Expand Down Expand Up @@ -1064,7 +1167,7 @@ double MatchSiftData(SiftData &data1, SiftData &data2)
dim3 blocksMax3(iDivUp(numPts1, 16), iDivUp(numPts2, 512));
dim3 threadsMax3(16, 16);
CleanMatches<<<iDivUp(numPts1, 64), 64>>>(sift1, numPts1);
int mode = 9;
int mode = 10;
if (mode==5)// K40c 5.0ms, 1080 Ti 1.2ms, 2080 Ti 0.83ms
FindMaxCorr5<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
else if (mode==6) { // 2080 Ti 0.89ms
Expand All @@ -1080,6 +1183,10 @@ double MatchSiftData(SiftData &data1, SiftData &data2)
blocksMax3 = dim3(iDivUp(numPts1, FMC_BW), iDivUp(numPts2, FMC_GH));
threadsMax3 = dim3(FMC_NW, FMC_NH);
FindMaxCorr9<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
} else if (mode==10) { // 2080 Ti 0.24ms
blocksMax3 = dim3(iDivUp(numPts1, M7W));
threadsMax3 = dim3(M7W, M7H/M7R);
FindMaxCorr10<<<blocksMax3, threadsMax3>>>(sift1, sift2, numPts1, numPts2);
}
safeCall(cudaDeviceSynchronize());
checkMsg("FindMaxCorr5() execution failed\n");
Expand Down

0 comments on commit 8c75178

Please sign in to comment.