|
@@ -218,9 +218,58 @@ struct AffineMatrix {
|
|
};
|
|
};
|
|
|
|
|
|
|
|
|
|
// In-place numerically-stable softmax over predict[0..length).
//
// Launch contract: a SINGLE 1-D block —
//   softmax<<<1, threads, threads * sizeof(float), stream>>>(predict, length);
// Dynamic shared memory must hold blockDim.x floats (one reduction slot per
// thread). The block-wide reductions below are only correct for one block;
// multiple blocks would race on predict[].
static __global__ void softmax(float *predict, int length)
{
    extern __shared__ float shared_data[];
    int tid = threadIdx.x;

    // 1. Each thread scans its strided slice for a local maximum.
    float max_val = -FLT_MAX;
    for (int i = tid; i < length; i += blockDim.x) {
        max_val = max(max_val, predict[i]);
    }
    shared_data[tid] = max_val;
    __syncthreads();

    // Thread 0 folds the per-thread maxima into the global maximum.
    if (tid == 0) {
        for (int i = 1; i < blockDim.x; i++) {
            shared_data[0] = max(shared_data[0], shared_data[i]);
        }
    }
    __syncthreads();
    max_val = shared_data[0];

    // 2. Exponentiate shifted by the max (numerical stability) and
    //    accumulate each thread's partial sum of exponentials.
    float sum_exp = 0.0f;
    for (int i = tid; i < length; i += blockDim.x) {
        predict[i] = expf(predict[i] - max_val);
        sum_exp += predict[i];
    }
    shared_data[tid] = sum_exp;
    __syncthreads();

    // Thread 0 folds the partial sums into the grand total.
    if (tid == 0) {
        for (int i = 1; i < blockDim.x; i++) {
            shared_data[0] += shared_data[i];
        }
    }
    __syncthreads();
    float total_sum = shared_data[0];

    // 3. Normalize: dividing each exponential by the total yields softmax.
    for (int i = tid; i < length; i += blockDim.x) {
        predict[i] /= total_sum;
    }
}
|
|
|
|
// Host wrapper: runs the in-place softmax kernel on predict[0..length)
// using `stream`.
//
// The kernel performs its reductions across one block only, so exactly ONE
// block is launched, and the dynamic shared memory must provide one float
// per thread (the kernel indexes shared_data[threadIdx.x]).
static void classfier_softmax(float *predict, int length, cudaStream_t stream) {
    const int threads = 256;                               // 1-D block; any multiple of 32 works
    const size_t shared_bytes = threads * sizeof(float);   // reduction scratch, one slot per thread
    checkKernel(softmax<<<1, threads, shared_bytes, stream>>>(predict, length));
}
|
|
}
|
|
|
|
|
|
class InferImpl : public Infer {
|
|
class InferImpl : public Infer {
|
|
@@ -342,15 +391,23 @@ class InferImpl : public Infer {
|
|
return {};
|
|
return {};
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ for (int ib = 0; ib < num_image; ++ib) {
|
|
|
|
+ float *output_array_device = output_array_.gpu() + ib * num_classes_;
|
|
|
|
+ checkRuntime(classfier_softmax(output_array_device, num_classes_));
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
|
|
checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
|
|
checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
|
|
output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
|
|
output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
|
|
checkRuntime(cudaStreamSynchronize(stream_));
|
|
checkRuntime(cudaStreamSynchronize(stream_));
|
|
|
|
|
|
- printf("size : %d\n", output_array_.cpu_size());
|
|
|
|
- // for (int ib = 0; ib < num_image; ++ib) {
|
|
|
|
-
|
|
|
|
- // }
|
|
|
|
|
|
+ for (int ib = 0; ib < num_image; ++ib) {
|
|
|
|
+ float *output_array_cpu = output_array_.cpu() + ib * num_classes_;
|
|
|
|
+ for (int i = 0; i < num_classes_; i++)
|
|
|
|
+ {
|
|
|
|
+ printf("prob : %f\t", *(output_array_cpu+i))
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
|
|
vector<Attribute> arrout(num_image);
|
|
vector<Attribute> arrout(num_image);
|
|
|
|
|