
Refine the code

leon 5 months ago
parent
commit
d36fc1681f
1 changed file with 62 additions and 5 deletions

+ 62 - 5
src/resnet.cu

@@ -218,9 +218,58 @@ struct AffineMatrix {
 };
 
 
-static __global__ void classfier(float *predict, cudaStream_t stream)
+
+static __global__ void softmax(float *predict, int length)
 {
+    extern __shared__ float shared_data[];
+    int tid = threadIdx.x;
+
+    // 1. Find the maximum value for numerical stability (FLT_MAX needs <cfloat>).
+    float max_val = -FLT_MAX;
+    for (int i = tid; i < length; i += blockDim.x) {
+        max_val = fmaxf(max_val, predict[i]);
+    }
+    shared_data[tid] = max_val;
+    __syncthreads();
+
+    // Reduce the per-thread maxima to the block-wide maximum.
+    if (tid == 0) {
+        for (int i = 1; i < blockDim.x; i++) {
+            shared_data[0] = fmaxf(shared_data[0], shared_data[i]);
+        }
+    }
+    __syncthreads();
+    max_val = shared_data[0];
+    // Barrier before shared_data is reused below, so no thread overwrites
+    // shared_data[0] while a slower thread is still reading the maximum.
+    __syncthreads();
+
+    // 2. Exponentiate and accumulate a per-thread partial sum.
+    float sum_exp = 0.0f;
+    for (int i = tid; i < length; i += blockDim.x) {
+        predict[i] = expf(predict[i] - max_val);
+        sum_exp += predict[i];
+    }
+    shared_data[tid] = sum_exp;
+    __syncthreads();
+
+    // Reduce the per-thread partial sums to the total.
+    if (tid == 0) {
+        for (int i = 1; i < blockDim.x; i++) {
+            shared_data[0] += shared_data[i];
+        }
+    }
+    __syncthreads();
+    float total_sum = shared_data[0];
+
+    // 3. Divide each element by the total to obtain softmax probabilities.
+    for (int i = tid; i < length; i += blockDim.x) {
+        predict[i] /= total_sum;
+    }
+}
+
+static void classfier_softmax(float *predict, int length, cudaStream_t stream) {
+  // The kernel reduces in shared memory across a single block, so launch one
+  // block and size the dynamic shared memory to one float per thread.
+  int block = 256;
+
+  checkKernel(softmax<<<1, block, block * sizeof(float), stream>>>(predict, length));
+}
 
 class InferImpl : public Infer {
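Note that both reductions above are executed serially by thread 0, which costs O(blockDim.x) per step. The more common CUDA pattern is a shared-memory tree reduction, which takes O(log blockDim.x). A minimal sketch, assuming a power-of-two block size; `block_reduce_max` is a hypothetical helper, not part of this commit:

__device__ float block_reduce_max(float val, float *shared_data) {
    int tid = threadIdx.x;
    shared_data[tid] = val;
    __syncthreads();

    // Halve the number of active threads each step: O(log blockDim.x)
    // instead of the O(blockDim.x) serial loop run by thread 0 above.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            shared_data[tid] = fmaxf(shared_data[tid], shared_data[tid + stride]);
        }
        __syncthreads();
    }

    // Every thread returns the block-wide maximum; the caller must
    // __syncthreads() once more before writing to shared_data again.
    return shared_data[0];
}

The sum reduction can follow the same pattern with += in place of fmaxf.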
@@ -342,15 +391,23 @@ class InferImpl : public Infer {
       return {};
     }
 
+    for (int ib = 0; ib < num_image; ++ib) {
+      float *output_array_device = output_array_.gpu() + ib * num_classes_;
+      classfier_softmax(output_array_device, num_classes_, stream_);
+    }
+
     
     checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
                                  output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
     checkRuntime(cudaStreamSynchronize(stream_));
 
-    printf("size : %d\n", output_array_.cpu_size());
-    // for (int ib = 0; ib < num_image; ++ib) {
-      
-    // }
+    for (int ib = 0; ib < num_image; ++ib) {
+      float *output_array_cpu = output_array_.cpu() + ib * num_classes_;
+      for (int i = 0; i < num_classes_; i++) {
+        printf("prob : %f\t", *(output_array_cpu + i));
+      }
+      printf("\n");
+    }
 
     vector<Attribute> arrout(num_image);
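A quick way to sanity-check the printed probabilities is to run a CPU reference softmax on the raw scores and compare. A minimal sketch, assuming only the standard library; `softmax_cpu` is an illustrative name, not part of this commit:

#include <algorithm>
#include <cfloat>
#include <cmath>

// In-place softmax over `length` scores, same contract as the kernel above.
static void softmax_cpu(float *data, int length) {
    float max_val = -FLT_MAX;
    for (int i = 0; i < length; i++)
        max_val = std::max(max_val, data[i]);

    float total = 0.0f;
    for (int i = 0; i < length; i++) {
        data[i] = std::exp(data[i] - max_val);
        total += data[i];
    }

    for (int i = 0; i < length; i++)
        data[i] /= total;
}

Each image's num_classes_ outputs should then sum to 1 within floating-point tolerance.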