-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathyolo_kernel.cu
85 lines (68 loc) · 3.47 KB
/
yolo_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#include "yolo_kernel.h"
#include <iostream>
namespace yolo {
// Grid-stride loop header for use inside a kernel body.
// Declares an int loop variable `i` that starts at the calling thread's
// global 1-D index (blockIdx.x * blockDim.x + threadIdx.x) and advances by
// the total number of launched threads (blockDim.x * gridDim.x) while
// i < n. Only the x dimension of the launch is used, so every index in
// [0, n) is visited exactly once regardless of how many blocks were launched.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// Number of threads per block used to size kernel launches.
static const int YOLO_CUDA_NUM_THREADS = 512;

// Returns how many blocks of YOLO_CUDA_NUM_THREADS threads are needed to
// cover N work items (ceiling division).
//
// The result is never smaller than 1: a grid dimension of 0 is an invalid
// launch configuration, so N <= 0 maps to a single block.
// The quotient/remainder form is used instead of
// (N + YOLO_CUDA_NUM_THREADS - 1) / YOLO_CUDA_NUM_THREADS because the latter
// overflows a signed int when N is close to INT_MAX.
static inline int YOLO_GET_BLOCKS(const int N) {
    if (N <= 0) {
        return 1;
    }
    const int full_blocks = N / YOLO_CUDA_NUM_THREADS;
    const int tail_block = (N % YOLO_CUDA_NUM_THREADS != 0) ? 1 : 0;
    return full_blocks + tail_block;
}
// Logistic sigmoid evaluated in single precision: 1 / (1 + e^(-data)).
__device__ float Logist(float data) {
    const float denom = 1.0f + expf(-data);
    return 1.0f / denom;
}
// Decodes raw detection output into corner-format boxes, one candidate per
// flat index in [0, num).
//
// Index layout: anchor = index / (w * h), row = (index % (w * h)) / w,
// col = (index % (w * h)) % w. The candidate's (5 + classsn) raw values start
// at input[index * (5 + classsn)] and are read in the order
// x, y, width, height, objectness, then one value per class.
//
// A candidate is kept only if sigmoid(objectness) is not below objThresh and
// at least one class has sigmoid(class) * sigmoid(objectness) > 0.01. For a
// kept candidate, norm_box[index * 6 .. index * 6 + 5] receives
// {xmin / net_w, ymin / net_h, xmax / net_w, ymax / net_h, class id, score}.
// Slots of rejected candidates are not written at all.
//
// anchors is read as (width, height) pairs indexed by the anchor id.
// priorn is part of the signature but is not read inside the kernel.
template <typename Dtype>
__global__ void CalDetection(const int num, const int w, const int h, const int priorn, int net_w, int net_h,
                             const int classsn, const float objThresh,
                             const Dtype *input, const float *anchors, float *norm_box) {
    CUDA_KERNEL_LOOP(index, num) {
        // Split the flat index into (anchor, row, col).
        const int plane = w * h;
        const int anchor_id = index / plane;
        const int cell = index % plane;
        const int row = cell / w;
        const int col = cell % w;

        // Raw values belonging to this candidate.
        const int stride = 5 + classsn;
        const Dtype *raw = input + index * stride;

        // Objectness gate.
        const float objectness = Logist(raw[4]);
        if (objectness < objThresh) continue;

        // Best class by combined confidence; must exceed the 0.01 floor.
        int best_class = -1;
        float best_score = 0.01f;
        for (int c = 0; c < classsn; ++c) {
            const float conf = Logist(raw[5 + c]) * objectness;
            if (conf > best_score) {
                best_score = conf;
                best_class = c;
            }
        }
        if (best_class < 0) continue;

        // Box centre in network-input pixels, size scaled by the anchor.
        const float cx = (col + Logist(raw[0]) * 2 - 0.5f) * net_w / w;
        const float cy = (row + Logist(raw[1]) * 2 - 0.5f) * net_h / h;
        const float bw = powf(Logist(raw[2]) * 2, 2) * anchors[2 * anchor_id];
        const float bh = powf(Logist(raw[3]) * 2, 2) * anchors[2 * anchor_id + 1];

        // Corner coordinates divided by the network input size.
        float *out = norm_box + index * 6;
        out[0] = (cx - bw / 2) / net_w;
        out[1] = (cy - bh / 2) / net_h;
        out[2] = (cx + bw / 2) / net_w;
        out[3] = (cy + bh / 2) / net_h;
        out[4] = best_class;
        out[5] = best_score;
    }
}
// Host-side launcher for CalDetection.
//
// Enqueues one kernel on stream `ss` that decodes w * h * priorn candidate
// boxes. All arguments are forwarded to the kernel unchanged; input, anchors
// and norm_box are dereferenced by the kernel, so they must be addresses the
// device can access. No synchronization is performed here, so norm_box must
// not be read until work on `ss` has completed.
//
// If any of w, h or priorn is not positive there is nothing to decode and no
// kernel is launched. A launch error is reported on std::cerr; the CUDA error
// state is only peeked at, not cleared, so the caller can still query it.
template <typename Dtype>
void yolo_detection(int w, int h, int priorn, int classn, float objThresh, int net_w, int net_h
    , Dtype *input, Dtype *anchors, float *norm_box, cudaStream_t ss) {
    // Nothing to decode for an empty or invalid grid; skip the launch entirely.
    if (w <= 0 || h <= 0 || priorn <= 0) {
        return;
    }

    const int num = w * h * priorn;
    CalDetection<<<YOLO_GET_BLOCKS(num), YOLO_CUDA_NUM_THREADS, 0, ss>>>(
        num, w, h, priorn, net_w, net_h, classn, objThresh, input, anchors, norm_box);

    // Kernel launches do not return a status; surface configuration errors
    // without consuming the error so callers' own checks keep working.
    const cudaError_t launch_status = cudaPeekAtLastError();
    if (launch_status != cudaSuccess) {
        std::cerr << "yolo_detection: CUDA error after CalDetection launch: "
                  << cudaGetErrorString(launch_status) << std::endl;
    }
}
// Explicit instantiation of yolo_detection for Dtype = float
// (the only instantiation provided in this file).
template void yolo_detection<float>(int, int, int, int, float, int, int,
                                    float *, float *, float *, cudaStream_t);
}