@@ -187,6 +187,7 @@ struct AffineMatrix {
     float scale_x = get<0>(to) / (float)get<0>(from);
     float scale_y = get<1>(to) / (float)get<1>(from);
     float scale = std::min(scale_x, scale_y);
+
     // letter box
     // i2d[0] = scale;
     // i2d[1] = 0;
@@ -194,6 +195,7 @@ struct AffineMatrix {
     // i2d[3] = 0;
     // i2d[4] = scale;
     // i2d[5] = -scale * get<1>(from) * 0.5 + get<1>(to) * 0.5 + scale * 0.5 - 0.5;
+
     // resize
     i2d[0] = scale;
     i2d[1] = 0;
@@ -220,27 +222,39 @@ struct AffineMatrix {
 
 
 
-static __global__ void softmax(float *predict, int length)
-{
+static __global__ void softmax(float *predict, int length, int *max_index) {
     extern __shared__ float shared_data[];
+    float *shared_max_vals = shared_data;
+    int *shared_max_indices = (int *)&shared_max_vals[blockDim.x];
+
     int tid = threadIdx.x;
 
-    // 1. Find the maximum value and store it in shared memory
+    // 1. Find the maximum value and its index, store them in shared memory
     float max_val = -FLT_MAX;
+    int max_idx = -1;
     for (int i = tid; i < length; i += blockDim.x) {
-        max_val = max(max_val, predict[i]);
+        if (predict[i] > max_val) {
+            max_val = predict[i];
+            max_idx = i;
+        }
     }
-    shared_data[tid] = max_val;
+    shared_max_vals[tid] = max_val;
+    shared_max_indices[tid] = max_idx;
     __syncthreads();
 
-    // Find the global maximum across all threads
+    // Find the global maximum and its index across all threads
     if (tid == 0) {
         for (int i = 1; i < blockDim.x; i++) {
-            shared_data[0] = max(shared_data[0], shared_data[i]);
+            if (shared_max_vals[i] > shared_max_vals[0]) {
+                shared_max_vals[0] = shared_max_vals[i];
+                shared_max_indices[0] = shared_max_indices[i];
+            }
         }
+        *max_index = shared_max_indices[0];
     }
     __syncthreads();
-    max_val = shared_data[0];
+
+    max_val = shared_max_vals[0];
 
     // 2. Compute the exponentials and accumulate their sum
     float sum_exp = 0.0f;
@@ -248,17 +262,17 @@ static __global__ void softmax(float *predict, int length)
         predict[i] = expf(predict[i] - max_val);
         sum_exp += predict[i];
     }
-    shared_data[tid] = sum_exp;
+    shared_max_vals[tid] = sum_exp;
     __syncthreads();
 
     // Accumulate the per-thread exponential sums
     if (tid == 0) {
         for (int i = 1; i < blockDim.x; i++) {
-            shared_data[0] += shared_data[i];
+            shared_max_vals[0] += shared_max_vals[i];
         }
     }
     __syncthreads();
-    float total_sum = shared_data[0];
+    float total_sum = shared_max_vals[0];
 
     // 3. Divide each element by the total sum to obtain the softmax values
     for (int i = tid; i < length; i += blockDim.x) {
@@ -266,9 +280,9 @@ static __global__ void softmax(float *predict, int length)
     }
 }
 
-static void classfier_softmax(float *predict, int length, cudaStream_t stream) {
+static void classfier_softmax(float *predict, int length, int *max_index, cudaStream_t stream) {
     int block_size = 256;
-    checkKernel(softmax<<<1, block_size, block_size * sizeof(float), stream>>>(predict, length));
+    checkKernel(softmax<<<1, block_size, block_size * (sizeof(float) + sizeof(int)), stream>>>(predict, length, max_index));
 }
 
 class InferImpl : public Infer {
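As a quick sanity check for the modified kernel (not part of the patch itself), the following standalone sketch can be compiled into the same translation unit as the `softmax` kernel above; the buffer names (`d_logits`, `d_max_index`) and test values are made up for illustration. It plants a known maximum in the logits and verifies that the kernel writes its index back through `max_index`. Note that the kernel now packs blockDim.x floats plus blockDim.x ints into dynamic shared memory, so the launch must request block_size * (sizeof(float) + sizeof(int)) bytes.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int length = 1000;                 // e.g. number of classes
    std::vector<float> logits(length);
    for (int i = 0; i < length; ++i)
        logits[i] = 0.001f * (i % 97);       // arbitrary test values
    logits[123] = 10.0f;                     // plant a known maximum

    float *d_logits = nullptr;
    int *d_max_index = nullptr;
    cudaMalloc(&d_logits, length * sizeof(float));
    cudaMalloc(&d_max_index, sizeof(int));
    cudaMemcpy(d_logits, logits.data(), length * sizeof(float), cudaMemcpyHostToDevice);

    // Shared memory: block_size floats for the values plus block_size ints for the indices
    const int block_size = 256;
    softmax<<<1, block_size, block_size * (sizeof(float) + sizeof(int))>>>(d_logits, length, d_max_index);

    int max_index = -1;
    cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
    printf("GPU argmax = %d (expected 123)\n", max_index);

    cudaFree(d_logits);
    cudaFree(d_max_index);
    return 0;
}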
@@ -277,6 +291,7 @@ class InferImpl : public Infer {
     string engine_file_;
     vector<shared_ptr<trt::Memory<unsigned char>>> preprocess_buffers_;
     trt::Memory<float> input_buffer_, output_array_;
+    trt::Memory<int> classes_indices_;
     int network_input_width_, network_input_height_;
     Norm normalize_;
     int num_classes_ = 0;
@@ -290,6 +305,8 @@ class InferImpl : public Infer {
     input_buffer_.gpu(batch_size * input_numel);
     output_array_.gpu(batch_size * num_classes_);
     output_array_.cpu(batch_size * num_classes_);
+    classes_indices_.gpu(batch_size);
+    classes_indices_.cpu(batch_size);
 
 
     if ((int)preprocess_buffers_.size() < batch_size) {
@@ -396,12 +413,15 @@ class InferImpl : public Infer {
 
     for (int ib = 0; ib < num_image; ++ib) {
         float *output_array_device = output_array_.gpu() + ib * num_classes_;
-        classfier_softmax(output_array_device, num_classes_, stream_);
+        int *classes_indices_device = classes_indices_.gpu() + ib;
+        classfier_softmax(output_array_device, num_classes_, classes_indices_device, stream_);
     }
 
 
     checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
                                  output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaMemcpyAsync(classes_indices_.cpu(), classes_indices_.gpu(),
+                                 classes_indices_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
     checkRuntime(cudaStreamSynchronize(stream_));
 
     vector<Attribute> arrout;
@@ -409,16 +429,9 @@ class InferImpl : public Infer {
 
     for (int ib = 0; ib < num_image; ++ib) {
         float *output_array_cpu = output_array_.cpu() + ib * num_classes_;
-        float max_score = 0.f;
-        int index = -1;
-        for (int i = 0; i < num_classes_; i++)
-        {
-            if (*(output_array_cpu+i) > max_score)
-            {
-                index = i;
-                max_score = *(output_array_cpu+i);
-            }
-        }
+        int *max_index = classes_indices_.cpu() + ib;
+        int index = *max_index;
+        float max_score = output_array_cpu[index];
         arrout.emplace_back(max_score, index);
     }
     for (int ib = 0; ib < num_image; ++ib) {
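For reference only (not part of the original change), a CPU version of the same softmax-with-argmax computation is sketched below; it mirrors the three steps of the kernel (max/argmax, shifted exponentials and their sum, normalization) and can be used to cross-check the GPU results on a host copy of the logits. The function name is hypothetical.

#include <algorithm>
#include <cmath>
#include <vector>

// Normalizes `predict` in place with softmax and returns the index of its maximum element.
static int softmax_argmax_cpu(std::vector<float> &predict) {
    // 1. find the maximum value and its index
    int max_idx = (int)(std::max_element(predict.begin(), predict.end()) - predict.begin());
    float max_val = predict[max_idx];

    // 2. exponentiate (shifted by the max for numerical stability) and accumulate the sum
    float sum_exp = 0.0f;
    for (float &v : predict) {
        v = std::exp(v - max_val);
        sum_exp += v;
    }

    // 3. divide each element by the total sum
    for (float &v : predict) v /= sum_exp;
    return max_idx;
}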