
Improve the program: fetch the max index directly from the kernel

leon 5 months ago
parent
commit
c378314e61
1 changed file with 37 additions and 24 deletions

+ 37 - 24
src/resnet.cu

@@ -187,6 +187,7 @@ struct AffineMatrix {
     float scale_x = get<0>(to) / (float)get<0>(from);
     float scale_y = get<1>(to) / (float)get<1>(from);
     float scale = std::min(scale_x, scale_y);
+
     // letter box
     // i2d[0] = scale;
     // i2d[1] = 0;
@@ -194,6 +195,7 @@ struct AffineMatrix {
     // i2d[3] = 0;
     // i2d[4] = scale;
     // i2d[5] = -scale * get<1>(from) * 0.5 + get<1>(to) * 0.5 + scale * 0.5 - 0.5;
+
     // resize 
     i2d[0] = scale;
     i2d[1] = 0;
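
As an aside on the context lines above: taking the smaller of the two axis ratios is what preserves the aspect ratio. A tiny standalone illustration, with made-up 640x480 -> 224x224 numbers (not from this repo):

#include <algorithm>
#include <cstdio>

int main() {
    float scale_x = 224 / 640.0f;              // 0.35
    float scale_y = 224 / 480.0f;              // ~0.4667
    float scale = std::min(scale_x, scale_y);  // 0.35 -> the limiting axis wins
    std::printf("scaled: %.0f x %.0f\n", 640 * scale, 480 * scale);  // 224 x 168
    return 0;
}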
@@ -220,27 +222,39 @@ struct AffineMatrix {
 
 
 
-static __global__ void softmax(float *predict, int length)
-{
+static __global__ void softmax(float *predict, int length, int *max_index) {
     extern __shared__ float shared_data[];
+    float *shared_max_vals = shared_data;
+    int *shared_max_indices = (int*)&shared_max_vals[blockDim.x];
+    
     int tid = threadIdx.x;
 
-    // 1. Find the max value and store it in shared memory
+    // 1. Find the max value and its index, and store them in shared memory
     float max_val = -FLT_MAX;
+    int max_idx = -1;
     for (int i = tid; i < length; i += blockDim.x) {
-        max_val = max(max_val, predict[i]);
+        if (predict[i] > max_val) {
+            max_val = predict[i];
+            max_idx = i;
+        }
     }
-    shared_data[tid] = max_val;
+    shared_max_vals[tid] = max_val;
+    shared_max_indices[tid] = max_idx;
     __syncthreads();
 
-    // Find the global max value across all threads
+    // Find the global max value and its index across all threads
     if (tid == 0) {
         for (int i = 1; i < blockDim.x; i++) {
-            shared_data[0] = max(shared_data[0], shared_data[i]);
+            if (shared_max_vals[i] > shared_max_vals[0]) {
+                shared_max_vals[0] = shared_max_vals[i];
+                shared_max_indices[0] = shared_max_indices[i];
+            }
         }
+        *max_index = shared_max_indices[0];
     }
     __syncthreads();
-    max_val = shared_data[0];
+    max_val = shared_max_vals[0];
+    __syncthreads();  // every thread must read the max before the buffer is reused for the sums
 
     // 2. Compute the exponentials and accumulate their sum
     float sum_exp = 0.0f;
@@ -248,17 +262,17 @@ static __global__ void softmax(float *predict, int length)
         predict[i] = expf(predict[i] - max_val);
         sum_exp += predict[i];
     }
-    shared_data[tid] = sum_exp;
+    shared_max_vals[tid] = sum_exp;
     __syncthreads();
 
     // Accumulate the per-thread exponential sums
     if (tid == 0) {
         for (int i = 1; i < blockDim.x; i++) {
-            shared_data[0] += shared_data[i];
+            shared_max_vals[0] += shared_max_vals[i];
         }
     }
     __syncthreads();
-    float total_sum = shared_data[0];
+    float total_sum = shared_max_vals[0];
 
     // 3. Divide each element by the total to obtain the softmax values
     for (int i = tid; i < length; i += blockDim.x) {
@@ -266,9 +280,9 @@ static __global__ void softmax(float *predict, int length)
     }
 }
 
-static void classfier_softmax(float *predict, int length, cudaStream_t stream) {
+static void classfier_softmax(float *predict, int length, int *max_index, cudaStream_t stream) {
   int block_size = 256;
-  checkKernel(softmax<<<1, block_size, block_size * sizeof(float), stream>>>(predict, length));
+  checkKernel(softmax<<<1, block_size, block_size * (sizeof(float) + sizeof(int)), stream>>>(predict, length, max_index));  // shared memory now holds block_size floats plus block_size ints
 }
 
 class InferImpl : public Infer {
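
Before the InferImpl changes below, a quick way to sanity-check the reworked kernel in isolation is a single-file harness along these lines. This is a sketch, not part of the commit: it assumes the softmax kernel above is compiled in the same translation unit (it is declared static), and it plants a known maximum in dummy logits:

#include <cstdio>
#include <cfloat>
#include <cuda_runtime.h>

int main() {
    const int length = 1000, block_size = 256;

    // Dummy logits with a known maximum at index 42.
    float h_logits[1000];
    for (int i = 0; i < length; ++i) h_logits[i] = 0.001f * i;
    h_logits[42] = 10.0f;

    float *d_logits; int *d_max_index;
    cudaMalloc(&d_logits, length * sizeof(float));
    cudaMalloc(&d_max_index, sizeof(int));
    cudaMemcpy(d_logits, h_logits, length * sizeof(float), cudaMemcpyHostToDevice);

    // Dynamic shared memory must cover block_size floats plus block_size ints.
    size_t smem = block_size * (sizeof(float) + sizeof(int));
    softmax<<<1, block_size, smem>>>(d_logits, length, d_max_index);

    int h_max_index = -1;
    cudaMemcpy(&h_max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("argmax = %d (expected 42)\n", h_max_index);

    cudaFree(d_logits);
    cudaFree(d_max_index);
    return 0;
}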
@@ -277,6 +291,7 @@ class InferImpl : public Infer {
   string engine_file_;
   vector<shared_ptr<trt::Memory<unsigned char>>> preprocess_buffers_;
   trt::Memory<float> input_buffer_, output_array_;
+  trt::Memory<int> classes_indices_;  // per-image argmax written by the kernel
   int network_input_width_, network_input_height_;
   Norm normalize_;
   int num_classes_ = 0;
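
trt::Memory<T> itself is not shown in this diff; from its use here it appears to pair a device allocation with a host one and hand out raw pointers. A minimal stand-in under that reading, purely illustrative (the real class in the repo may manage reallocation and pinning differently):

#include <cuda_runtime.h>
#include <cstddef>

// Illustrative stand-in for trt::Memory<T>: one device buffer, one pinned
// host buffer, sized on first use. No reallocation handling, unlike a
// production class.
template <typename T>
class Memory {
    T *gpu_ = nullptr, *cpu_ = nullptr;
    size_t count_ = 0;
public:
    T *gpu(size_t n) { if (!gpu_) { cudaMalloc(&gpu_, n * sizeof(T)); count_ = n; } return gpu_; }
    T *cpu(size_t n) { if (!cpu_) cudaMallocHost(&cpu_, n * sizeof(T)); return cpu_; }
    T *gpu() const { return gpu_; }
    T *cpu() const { return cpu_; }
    size_t gpu_bytes() const { return count_ * sizeof(T); }
    ~Memory() { if (gpu_) cudaFree(gpu_); if (cpu_) cudaFreeHost(cpu_); }
};

int main() {
    Memory<int> indices;
    indices.gpu(8);
    indices.cpu(8);  // mirrors classes_indices_ above
    return 0;
}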
@@ -290,6 +305,8 @@ class InferImpl : public Infer {
     input_buffer_.gpu(batch_size * input_numel);
     output_array_.gpu(batch_size * num_classes_);
     output_array_.cpu(batch_size * num_classes_);
+    classes_indices_.gpu(batch_size);
+    classes_indices_.cpu(batch_size);
 
 
     if ((int)preprocess_buffers_.size() < batch_size) {
@@ -396,12 +413,15 @@ class InferImpl : public Infer {
 
     for (int ib = 0; ib < num_image; ++ib) {
       float *output_array_device = output_array_.gpu() + ib * num_classes_;
-      classfier_softmax(output_array_device, num_classes_, stream_);
+      int *classes_indices_device = classes_indices_.gpu() + ib;
+      classfier_softmax(output_array_device, num_classes_, classes_indices_device, stream_);
     }
 
     
     checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
                                  output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaMemcpyAsync(classes_indices_.cpu(), classes_indices_.gpu(),
+                                 classes_indices_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
     checkRuntime(cudaStreamSynchronize(stream_));
 
     vector<Attribute> arrout;
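
Both device-to-host copies above are enqueued on the same stream, so they run in issue order after the kernels, and the single cudaStreamSynchronize covers both. A self-contained sketch of that ordering (buffer names are illustrative; the host buffer is page-locked, which is what lets cudaMemcpyAsync actually overlap):

#include <cuda_runtime.h>

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    float *d_buf, *h_buf;
    cudaMalloc(&d_buf, 256 * sizeof(float));
    cudaMallocHost(&h_buf, 256 * sizeof(float));  // pinned host memory

    // Two async copies on one stream run back-to-back, in issue order...
    cudaMemcpyAsync(h_buf,       d_buf,       128 * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaMemcpyAsync(h_buf + 128, d_buf + 128, 128 * sizeof(float), cudaMemcpyDeviceToHost, stream);

    // ...so one synchronize is enough before the host reads either half.
    cudaStreamSynchronize(stream);

    cudaFreeHost(h_buf);
    cudaFree(d_buf);
    cudaStreamDestroy(stream);
    return 0;
}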
@@ -409,16 +429,9 @@ class InferImpl : public Infer {
 
     for (int ib = 0; ib < num_image; ++ib) {
       float *output_array_cpu = output_array_.cpu() + ib * num_classes_;
-      float max_score = 0.f;
-      int index = -1;
-      for (int i = 0; i < num_classes_; i++)
-      {
-        if (*(output_array_cpu+i) > max_score)
-        {
-            index = i;
-            max_score = *(output_array_cpu+i);
-        }
-      }
+      int *max_index = classes_indices_.cpu() + ib;
+      int index = *max_index;
+      float max_score = output_array_cpu[index];
       arrout.emplace_back(max_score, index);
     }
     for (int ib = 0; ib < num_image; ++ib) {
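
The CPU scan this last hunk deletes can still earn its keep as a debug-build cross-check against the kernel's pick. A self-contained sketch (cpu_argmax and the dummy values are illustrative, not repo code):

#include <cassert>
#include <cstdio>

// Recompute the argmax on the CPU and compare with the kernel's result.
static int cpu_argmax(const float *p, int n) {
    int best = 0;
    for (int i = 1; i < n; ++i)
        if (p[i] > p[best]) best = i;
    return best;
}

int main() {
    float probs[] = {0.1f, 0.7f, 0.2f};  // stand-in for output_array_cpu
    int kernel_index = 1;                // stand-in for the index copied back from the GPU
    assert(cpu_argmax(probs, 3) == kernel_index);
    std::printf("argmax cross-check passed\n");
    return 0;
}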