
Refine the code

leon 5 months ago
parent
commit
d36fc1681f
1 changed file with 62 additions and 5 deletions

+ 62 - 5
src/resnet.cu

@@ -218,9 +218,58 @@ struct AffineMatrix {
 };
 
 
-static __global__ void classfier(float *predict, cudaStream_t stream)
+
+static __global__ void softmax(float *predict, int length)
 {
+    extern __shared__ float shared_data[];
+    int tid = threadIdx.x;
+
+    // 1. Find the maximum value for numerical stability (FLT_MAX needs <cfloat>).
+    float max_val = -FLT_MAX;
+    for (int i = tid; i < length; i += blockDim.x) {
+        max_val = fmaxf(max_val, predict[i]);
+    }
+    shared_data[tid] = max_val;
+    __syncthreads();
+
+    // Reduce the per-thread maxima to the block-wide maximum.
+    if (tid == 0) {
+        for (int i = 1; i < blockDim.x; i++) {
+            shared_data[0] = fmaxf(shared_data[0], shared_data[i]);
+        }
+    }
+    __syncthreads();
+    max_val = shared_data[0];
+    // Barrier before shared_data is reused below, so no thread overwrites
+    // shared_data[0] while a slower thread is still reading the maximum.
+    __syncthreads();
+
+    // 2. Exponentiate and accumulate a per-thread partial sum.
+    float sum_exp = 0.0f;
+    for (int i = tid; i < length; i += blockDim.x) {
+        predict[i] = expf(predict[i] - max_val);
+        sum_exp += predict[i];
+    }
+    shared_data[tid] = sum_exp;
+    __syncthreads();
+
+    // Reduce the per-thread partial sums to the total.
+    if (tid == 0) {
+        for (int i = 1; i < blockDim.x; i++) {
+            shared_data[0] += shared_data[i];
+        }
+    }
+    __syncthreads();
+    float total_sum = shared_data[0];
+
+    // 3. Divide each element by the total to obtain softmax probabilities.
+    for (int i = tid; i < length; i += blockDim.x) {
+        predict[i] /= total_sum;
+    }
+}
+
+static void classfier_softmax(float *predict, int length, cudaStream_t stream) {
+  // The kernel reduces in shared memory across a single block, so launch one
+  // block and size the dynamic shared memory to one float per thread.
+  int block = 256;
+
+  checkKernel(softmax<<<1, block, block * sizeof(float), stream>>>(predict, length));
+}
 
 class InferImpl : public Infer {
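Note that both reductions above are executed serially by thread 0, which costs O(blockDim.x) per step. The more common CUDA pattern is a shared-memory tree reduction, which takes O(log blockDim.x). A minimal sketch, assuming a power-of-two block size; `block_reduce_max` is a hypothetical helper, not part of this commit:

__device__ float block_reduce_max(float val, float *shared_data) {
    int tid = threadIdx.x;
    shared_data[tid] = val;
    __syncthreads();

    // Halve the number of active threads each step: O(log blockDim.x)
    // instead of the O(blockDim.x) serial loop run by thread 0 above.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            shared_data[tid] = fmaxf(shared_data[tid], shared_data[tid + stride]);
        }
        __syncthreads();
    }

    // Every thread returns the block-wide maximum; the caller must
    // __syncthreads() once more before writing to shared_data again.
    return shared_data[0];
}

The sum reduction can follow the same pattern with += in place of fmaxf.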
@@ -342,15 +391,23 @@ class InferImpl : public Infer {
       return {};
     }
 
+    for (int ib = 0; ib < num_image; ++ib) {
+      float *output_array_device = output_array_.gpu() + ib * num_classes_;
+      classfier_softmax(output_array_device, num_classes_, stream_);
+    }
+
     
     checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
                                  output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
     checkRuntime(cudaStreamSynchronize(stream_));
 
-    printf("size : %d\n", output_array_.cpu_size());
-    // for (int ib = 0; ib < num_image; ++ib) {
-      
-    // }
+    for (int ib = 0; ib < num_image; ++ib) {
+      float *output_array_cpu = output_array_.cpu() + ib * num_classes_;
+      for (int i = 0; i < num_classes_; i++) {
+        printf("prob : %f\t", *(output_array_cpu + i));
+      }
+      printf("\n");
+    }
 
     vector<Attribute> arrout(num_image);
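A quick way to sanity-check the printed probabilities is to run a CPU reference softmax on the raw scores and compare. A minimal sketch, assuming only the standard library; `softmax_cpu` is an illustrative name, not part of this commit:

#include <algorithm>
#include <cfloat>
#include <cmath>

// In-place softmax over `length` scores, same contract as the kernel above.
static void softmax_cpu(float *data, int length) {
    float max_val = -FLT_MAX;
    for (int i = 0; i < length; i++)
        max_val = std::max(max_val, data[i]);

    float total = 0.0f;
    for (int i = 0; i < length; i++) {
        data[i] = std::exp(data[i] - max_val);
        total += data[i];
    }

    for (int i = 0; i < length; i++)
        data[i] /= total;
}

Each image's num_classes_ outputs should then sum to 1 within floating-point tolerance.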