#include "infer/trt/classfier/classifier.hpp" #include #include #include namespace cls { static __global__ void softmax(float *predict, int length, int *max_index) { extern __shared__ float shared_data[]; float *shared_max_vals = shared_data; int *shared_max_indices = (int*)&shared_max_vals[blockDim.x]; int tid = threadIdx.x; // 1. 找到最大值和最大值的下标,存储在共享内存中 float max_val = -FLT_MAX; int max_idx = -1; for (int i = tid; i < length; i += blockDim.x) { if (predict[i] > max_val) { max_val = predict[i]; max_idx = i; } } shared_max_vals[tid] = max_val; shared_max_indices[tid] = max_idx; __syncthreads(); // 在所有线程间找到全局最大值和对应的下标 if (tid == 0) { for (int i = 1; i < blockDim.x; i++) { if (shared_max_vals[i] > shared_max_vals[0]) { shared_max_vals[0] = shared_max_vals[i]; shared_max_indices[0] = shared_max_indices[i]; } } *max_index = shared_max_indices[0]; } __syncthreads(); max_val = shared_max_vals[0]; // 2. 计算指数并求和 float sum_exp = 0.0f; for (int i = tid; i < length; i += blockDim.x) { predict[i] = expf(predict[i] - max_val); sum_exp += predict[i]; } shared_max_vals[tid] = sum_exp; __syncthreads(); // 汇总所有线程的指数和 if (tid == 0) { for (int i = 1; i < blockDim.x; i++) { shared_max_vals[0] += shared_max_vals[i]; } } __syncthreads(); float total_sum = shared_max_vals[0]; // 3. 每个元素除以总和,得到 softmax 值 for (int i = tid; i < length; i += blockDim.x) { predict[i] /= total_sum; } } static void classfier_softmax(float *predict, int length, int *max_index, cudaStream_t stream) { int block_size = 256; checkKernel(softmax<<<1, block_size, block_size * sizeof(float), stream>>>(predict, length, max_index)); } bool ClassifierModelImpl::load(const std::string &engine_file, int gpu_id) { trt_ = TensorRT::load(engine_file); device_id_ = gpu_id; if (trt_ == nullptr) return false; trt_->print(); auto input_dim = trt_->static_dims(0); network_input_width_ = input_dim[3]; network_input_height_ = input_dim[2]; isdynamic_model_ = trt_->has_dynamic_dim(); num_classes_ = trt_->static_dims(1)[1]; float mean[3] = {0.485f, 0.456f, 0.406f}; float std[3] = {0.229f, 0.224f, 0.225f}; normalize_ = affine::Norm::mean_std(mean, std, 1 / 255.0f, affine::ChannelType::SwapRB); // normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB); return true; } void ClassifierModelImpl::preprocess(int ibatch, affine::CropResizeMatrix& matrix, int x, int y, int w, int h, void *stream) { matrix.compute( std::make_tuple(w, h), std::make_tuple(network_input_width_, network_input_height_)); size_t input_numel = network_input_width_ * network_input_height_ * 3; float *input_device = input_buffer_.gpu() + ibatch * input_numel; uint8_t *image_device = preprocess_buffer_.gpu(); uint8_t *image_host = preprocess_buffer_.cpu(); float *affine_matrix_device = affine_matrix_.gpu(); float *affine_matrix_host = affine_matrix_.cpu(); cudaStream_t stream_ = (cudaStream_t)stream; memcpy(affine_matrix_host, matrix.d2i, sizeof(matrix.d2i)); checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(matrix.d2i), cudaMemcpyHostToDevice, stream_)); affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width, image.height, input_device, network_input_width_, network_input_height_, affine_matrix_device, 114, normalize_, stream_); checkRuntime(cudaStreamSynchronize(stream_)); } virtual Result ClassifierModelImpl::forward(const tensor::Image &image, void *stream) { return; } virtual Result ClassifierModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, 
float overlap_height_ratio, void *stream) { return; } virtual Result ClassifierModelImpl::forward(const tensor::Image &image, data::BoxArray& boxes, void *stream) { std::lock_guard lock(mutex_); std::vector classfier_boxes_ptr; for (auto& box : boxes) { if (std::find(box.label, class_names_.begin(), class_names_.end()) != class_names_.end()) { classfier_boxes_ptr.push_back(&box); } } int num_image = classfier_boxes_ptr.size(); if (num_image == 0){ return; } auto input_dims = trt_->static_dims(0); int infer_batch_size = input_dims[0]; if (infer_batch_size != num_image) { if (isdynamic_model_) { infer_batch_size = num_image; input_dims[0] = num_image; if (!trt_->set_run_dims(0, input_dims)) { printf("Fail to set run dims\n"); return; } } else { if (infer_batch_size < num_image) { printf( "When using static shape model, number of images[%d] must be " "less than or equal to the maximum batch[%d].", num_image, infer_batch_size); return; } } } adjust_memory(num_image, image.width, image.height); uint8_t *image_device = preprocess_buffer_.gpu(); uint8_t *image_host = preprocess_buffer_.cpu(); size_t size_image = image.width * image.height * 3; cudaStream_t stream_ = (cudaStream_t)stream; memcpy(image_host, image.bgrptr, size_image); checkRuntime( cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_)); affine::CropResizeMatrix crmatrix; for(int i = 0; i < num_image; i++) { data::Box* box_ptr = classfier_boxes_ptr[i]; int x = (int)box_ptr->left; int y = (int)box_ptr->top; int w = (int)box_ptr->right - x; int h = (int)box_ptr->bottom - y; preprocess(i, crmatrix, x, y, w, h, stream); } #ifdef TRT10 if (!trt_->forward(std::unordered_map{ { "input", input_buffer_.gpu() }, { "output", output_buffer_.gpu() } }, stream_)) { printf("Failed to tensorRT forward.\n"); return cv::Mat(); } #else std::vector bindings{input_buffer_.gpu(), output_buffer_.gpu()}; if (!trt_->forward(bindings, stream)) { printf("Failed to tensorRT forward."); return cv::Mat(); } #endif for (int ib = 0; ib < num_image; ++ib) { float *output_buffer_device = output_buffer_.gpu() + ib * num_classes_; int *classes_indices_device = classes_indices_.gpu() + ib; classfier_softmax(output_buffer_device, num_classes_, classes_indices_device, stream_); } checkRuntime(cudaMemcpyAsync(output_buffer_.cpu(), output_buffer_.gpu(), output_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_)); checkRuntime(cudaMemcpyAsync(classes_indices_.cpu(), classes_indices_.gpu(), classes_indices_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_)); checkRuntime(cudaStreamSynchronize(stream_)); for (int ib = 0; ib < num_image; ++ib) { int *max_index = classes_indices_.cpu() + ib; int index = *max_index; classfier_boxes_ptr[ib]->label = class_names_[index]; } } }
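// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original implementation).
// It assumes the surrounding project supplies a way to obtain a
// ClassifierModelImpl instance, a decoded BGR tensor::Image, a detector's
// data::BoxArray, and a valid cudaStream_t:
//
//   cls::ClassifierModelImpl classifier;
//   if (!classifier.load("classifier.engine", /*gpu_id=*/0)) {
//     printf("Failed to load classifier engine.\n");
//   } else {
//     // Boxes whose label appears in class_names_ are re-labelled in place.
//     classifier.forward(image, boxes, stream);
//   }
// ---------------------------------------------------------------------------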