@@ -187,6 +187,7 @@ struct AffineMatrix {
     float scale_x = get<0>(to) / (float)get<0>(from);
     float scale_y = get<1>(to) / (float)get<1>(from);
     float scale = std::min(scale_x, scale_y);
+
     // letter box
     // i2d[0] = scale;
     // i2d[1] = 0;
@@ -194,6 +195,7 @@ struct AffineMatrix {
     // i2d[3] = 0;
     // i2d[4] = scale;
     // i2d[5] = -scale * get<1>(from) * 0.5 + get<1>(to) * 0.5 + scale * 0.5 - 0.5;
+
     // resize
     i2d[0] = scale;
     i2d[1] = 0;
@@ -220,27 +222,39 @@ struct AffineMatrix {
 
 
 
-static __global__ void softmax(float *predict, int length)
-{
+static __global__ void softmax(float *predict, int length, int *max_index) {
     extern __shared__ float shared_data[];
+    float *shared_max_vals = shared_data;
+    int *shared_max_indices = (int *)&shared_max_vals[blockDim.x];
+
     int tid = threadIdx.x;
 
-    // 1. Find the maximum value and store it in shared memory
+    // 1. Find the maximum value and its index, store them in shared memory
     float max_val = -FLT_MAX;
+    int max_idx = -1;
     for (int i = tid; i < length; i += blockDim.x) {
-        max_val = max(max_val, predict[i]);
+        if (predict[i] > max_val) {
+            max_val = predict[i];
+            max_idx = i;
+        }
     }
-    shared_data[tid] = max_val;
+    shared_max_vals[tid] = max_val;
+    shared_max_indices[tid] = max_idx;
     __syncthreads();
 
-    // Find the global maximum across all threads
+    // Find the global maximum and its index across all threads
     if (tid == 0) {
         for (int i = 1; i < blockDim.x; i++) {
-            shared_data[0] = max(shared_data[0], shared_data[i]);
+            if (shared_max_vals[i] > shared_max_vals[0]) {
+                shared_max_vals[0] = shared_max_vals[i];
+                shared_max_indices[0] = shared_max_indices[i];
+            }
         }
+        *max_index = shared_max_indices[0];
     }
     __syncthreads();
-    max_val = shared_data[0];
+
+    max_val = shared_max_vals[0];
 
     // 2. Compute the exponentials and accumulate their sum
     float sum_exp = 0.0f;
@@ -248,17 +262,17 @@ static __global__ void softmax(float *predict, int length)
         predict[i] = expf(predict[i] - max_val);
         sum_exp += predict[i];
     }
-    shared_data[tid] = sum_exp;
+    shared_max_vals[tid] = sum_exp;
     __syncthreads();
 
     // Accumulate the per-thread exponential sums
     if (tid == 0) {
         for (int i = 1; i < blockDim.x; i++) {
-            shared_data[0] += shared_data[i];
+            shared_max_vals[0] += shared_max_vals[i];
         }
     }
     __syncthreads();
-    float total_sum = shared_data[0];
+    float total_sum = shared_max_vals[0];
 
     // 3. Divide each element by the total sum to obtain the softmax values
     for (int i = tid; i < length; i += blockDim.x) {
@@ -266,9 +280,9 @@ static __global__ void softmax(float *predict, int length)
     }
 }
 
-static void classfier_softmax(float *predict, int length, cudaStream_t stream) {
+static void classfier_softmax(float *predict, int length, int *max_index, cudaStream_t stream) {
     int block_size = 256;
-    checkKernel(softmax<<<1, block_size, block_size * sizeof(float), stream>>>(predict, length));
+    checkKernel(softmax<<<1, block_size, block_size * (sizeof(float) + sizeof(int)), stream>>>(predict, length, max_index));
 }
 
 class InferImpl : public Infer {
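As a quick sanity check for the modified kernel (not part of the patch itself), the following standalone sketch can be compiled into the same translation unit as the `softmax` kernel above; the buffer names (`d_logits`, `d_max_index`) and test values are made up for illustration. It plants a known maximum in the logits and verifies that the kernel writes its index back through `max_index`. Note that the kernel now packs blockDim.x floats plus blockDim.x ints into dynamic shared memory, so the launch must request block_size * (sizeof(float) + sizeof(int)) bytes.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int length = 1000;                 // e.g. number of classes
    std::vector<float> logits(length);
    for (int i = 0; i < length; ++i)
        logits[i] = 0.001f * (i % 97);       // arbitrary test values
    logits[123] = 10.0f;                     // plant a known maximum

    float *d_logits = nullptr;
    int *d_max_index = nullptr;
    cudaMalloc(&d_logits, length * sizeof(float));
    cudaMalloc(&d_max_index, sizeof(int));
    cudaMemcpy(d_logits, logits.data(), length * sizeof(float), cudaMemcpyHostToDevice);

    // Shared memory: block_size floats for the values plus block_size ints for the indices
    const int block_size = 256;
    softmax<<<1, block_size, block_size * (sizeof(float) + sizeof(int))>>>(d_logits, length, d_max_index);

    int max_index = -1;
    cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
    printf("GPU argmax = %d (expected 123)\n", max_index);

    cudaFree(d_logits);
    cudaFree(d_max_index);
    return 0;
}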
@@ -277,6 +291,7 @@ class InferImpl : public Infer {
     string engine_file_;
     vector<shared_ptr<trt::Memory<unsigned char>>> preprocess_buffers_;
     trt::Memory<float> input_buffer_, output_array_;
+    trt::Memory<int> classes_indices_;
     int network_input_width_, network_input_height_;
     Norm normalize_;
     int num_classes_ = 0;
@@ -290,6 +305,8 @@ class InferImpl : public Infer {
     input_buffer_.gpu(batch_size * input_numel);
     output_array_.gpu(batch_size * num_classes_);
     output_array_.cpu(batch_size * num_classes_);
+    classes_indices_.gpu(batch_size);
+    classes_indices_.cpu(batch_size);
 
 
     if ((int)preprocess_buffers_.size() < batch_size) {
@@ -396,12 +413,15 @@ class InferImpl : public Infer {
 
     for (int ib = 0; ib < num_image; ++ib) {
         float *output_array_device = output_array_.gpu() + ib * num_classes_;
-        classfier_softmax(output_array_device, num_classes_, stream_);
+        int *classes_indices_device = classes_indices_.gpu() + ib;
+        classfier_softmax(output_array_device, num_classes_, classes_indices_device, stream_);
     }
 
 
     checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
                                  output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
+    checkRuntime(cudaMemcpyAsync(classes_indices_.cpu(), classes_indices_.gpu(),
+                                 classes_indices_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
     checkRuntime(cudaStreamSynchronize(stream_));
 
     vector<Attribute> arrout;
@@ -409,16 +429,9 @@ class InferImpl : public Infer {
 
     for (int ib = 0; ib < num_image; ++ib) {
         float *output_array_cpu = output_array_.cpu() + ib * num_classes_;
-        float max_score = 0.f;
-        int index = -1;
-        for (int i = 0; i < num_classes_; i++)
-        {
-            if (*(output_array_cpu+i) > max_score)
-            {
-                index = i;
-                max_score = *(output_array_cpu+i);
-            }
-        }
+        int *max_index = classes_indices_.cpu() + ib;
+        int index = *max_index;
+        float max_score = output_array_cpu[index];
         arrout.emplace_back(max_score, index);
     }
     for (int ib = 0; ib < num_image; ++ib) {
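For reference only (not part of the original change), a CPU version of the same softmax-with-argmax computation is sketched below; it mirrors the three steps of the kernel (max/argmax, shifted exponentials and their sum, normalization) and can be used to cross-check the GPU results on a host copy of the logits. The function name is hypothetical.

#include <algorithm>
#include <cmath>
#include <vector>

// Normalizes `predict` in place with softmax and returns the index of its maximum element.
static int softmax_argmax_cpu(std::vector<float> &predict) {
    // 1. find the maximum value and its index
    int max_idx = (int)(std::max_element(predict.begin(), predict.end()) - predict.begin());
    float max_val = predict[max_idx];

    // 2. exponentiate (shifted by the max for numerical stability) and accumulate the sum
    float sum_exp = 0.0f;
    for (float &v : predict) {
        v = std::exp(v - max_val);
        sum_exp += v;
    }

    // 3. divide each element by the total sum
    for (float &v : predict) v /= sum_exp;
    return max_idx;
}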