classifier.cu 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
#include "infer/trt/classfier/classifier.hpp"

#include <cfloat>        // FLT_MAX (softmax kernel)
#include <cstdio>        // printf
#include <cstring>       // memcpy
#include <algorithm>
#include <mutex>
#include <unordered_map>
#include <vector>
  5. namespace cls
  6. {
  7. static __global__ void softmax(float *predict, int length, int *max_index) {
  8. extern __shared__ float shared_data[];
  9. float *shared_max_vals = shared_data;
  10. int *shared_max_indices = (int*)&shared_max_vals[blockDim.x];
  11. int tid = threadIdx.x;
  12. // 1. 找到最大值和最大值的下标,存储在共享内存中
  13. float max_val = -FLT_MAX;
  14. int max_idx = -1;
  15. for (int i = tid; i < length; i += blockDim.x) {
  16. if (predict[i] > max_val) {
  17. max_val = predict[i];
  18. max_idx = i;
  19. }
  20. }
  21. shared_max_vals[tid] = max_val;
  22. shared_max_indices[tid] = max_idx;
  23. __syncthreads();
  24. // 在所有线程间找到全局最大值和对应的下标
  25. if (tid == 0) {
  26. for (int i = 1; i < blockDim.x; i++) {
  27. if (shared_max_vals[i] > shared_max_vals[0]) {
  28. shared_max_vals[0] = shared_max_vals[i];
  29. shared_max_indices[0] = shared_max_indices[i];
  30. }
  31. }
  32. *max_index = shared_max_indices[0];
  33. }
  34. __syncthreads();
  35. max_val = shared_max_vals[0];
  36. // 2. 计算指数并求和
  37. float sum_exp = 0.0f;
  38. for (int i = tid; i < length; i += blockDim.x) {
  39. predict[i] = expf(predict[i] - max_val);
  40. sum_exp += predict[i];
  41. }
  42. shared_max_vals[tid] = sum_exp;
  43. __syncthreads();
  44. // 汇总所有线程的指数和
  45. if (tid == 0) {
  46. for (int i = 1; i < blockDim.x; i++) {
  47. shared_max_vals[0] += shared_max_vals[i];
  48. }
  49. }
  50. __syncthreads();
  51. float total_sum = shared_max_vals[0];
  52. // 3. 每个元素除以总和,得到 softmax 值
  53. for (int i = tid; i < length; i += blockDim.x) {
  54. predict[i] /= total_sum;
  55. }
  56. }
  57. static void classfier_softmax(float *predict, int length, int *max_index, cudaStream_t stream) {
  58. int block_size = 256;
  59. checkKernel(softmax<<<1, block_size, block_size * sizeof(float), stream>>>(predict, length, max_index));
  60. }
  61. bool ClassifierModelImpl::load(const std::string &engine_file, int gpu_id)
  62. {
  63. trt_ = TensorRT::load(engine_file);
  64. device_id_ = gpu_id;
  65. if (trt_ == nullptr) return false;
  66. trt_->print();
  67. auto input_dim = trt_->static_dims(0);
  68. network_input_width_ = input_dim[3];
  69. network_input_height_ = input_dim[2];
  70. isdynamic_model_ = trt_->has_dynamic_dim();
  71. num_classes_ = trt_->static_dims(1)[1];
  72. float mean[3] = {0.485f, 0.456f, 0.406f};
  73. float std[3] = {0.229f, 0.224f, 0.225f};
  74. normalize_ = affine::Norm::mean_std(mean, std, 1 / 255.0f, affine::ChannelType::SwapRB);
  75. // normalize_ = affine::Norm::alpha_beta(1 / 255.0f, 0.0f, affine::ChannelType::SwapRB);
  76. return true;
  77. }
// Prepare batch slot `ibatch` of the network input tensor: compute the
// crop-resize affine matrix, upload it, and warp + normalize the source
// image into the input buffer on the given stream.
//
// NOTE(review): `x` and `y` are accepted but never used — the crop offset
// never reaches matrix.compute(); confirm whether CropResizeMatrix is
// supposed to receive it.
// NOTE(review): this body reads `image.width`/`image.height`, but `image`
// is neither a parameter nor visibly declared here — verify it is a class
// member, otherwise this does not compile.
void ClassifierModelImpl::preprocess(int ibatch, affine::CropResizeMatrix& matrix, int x, int y, int w, int h, void *stream)
{
    // Map the (w, h) crop onto the network input resolution.
    matrix.compute(
        std::make_tuple(w, h),
        std::make_tuple(network_input_width_, network_input_height_));
    size_t input_numel = network_input_width_ * network_input_height_ * 3;
    // Destination slice for this batch element inside the input tensor.
    float *input_device = input_buffer_.gpu() + ibatch * input_numel;
    uint8_t *image_device = preprocess_buffer_.gpu();
    uint8_t *image_host = preprocess_buffer_.cpu();  // NOTE(review): unused in this function
    float *affine_matrix_device = affine_matrix_.gpu();
    float *affine_matrix_host = affine_matrix_.cpu();
    cudaStream_t stream_ = (cudaStream_t)stream;
    // Stage the inverse affine matrix on the host, then copy it to the device.
    memcpy(affine_matrix_host, matrix.d2i, sizeof(matrix.d2i));
    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(matrix.d2i),
        cudaMemcpyHostToDevice, stream_));
    // Warp + normalize straight into the input tensor slice (114 = border fill).
    affine::warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
        image.height, input_device, network_input_width_,
        network_input_height_, affine_matrix_device, 114,
        normalize_, stream_);
    // NOTE(review): synchronizing per batch element serializes the pipeline;
    // consider a single synchronize after all elements are queued.
    checkRuntime(cudaStreamSynchronize(stream_));
}
  99. virtual Result ClassifierModelImpl::forward(const tensor::Image &image, void *stream)
  100. {
  101. return;
  102. }
  103. virtual Result ClassifierModelImpl::forward(const tensor::Image &image, int slice_width, int slice_height, float overlap_width_ratio, float overlap_height_ratio, void *stream)
  104. {
  105. return;
  106. }
  107. virtual Result ClassifierModelImpl::forward(const tensor::Image &image, data::BoxArray& boxes, void *stream)
  108. {
  109. std::lock_guard<std::mutex> lock(mutex_);
  110. std::vector<data::Box*> classfier_boxes_ptr;
  111. for (auto& box : boxes)
  112. {
  113. if (std::find(box.label, class_names_.begin(), class_names_.end()) != class_names_.end())
  114. {
  115. classfier_boxes_ptr.push_back(&box);
  116. }
  117. }
  118. int num_image = classfier_boxes_ptr.size();
  119. if (num_image == 0){ return; }
  120. auto input_dims = trt_->static_dims(0);
  121. int infer_batch_size = input_dims[0];
  122. if (infer_batch_size != num_image)
  123. {
  124. if (isdynamic_model_)
  125. {
  126. infer_batch_size = num_image;
  127. input_dims[0] = num_image;
  128. if (!trt_->set_run_dims(0, input_dims))
  129. {
  130. printf("Fail to set run dims\n");
  131. return;
  132. }
  133. }
  134. else
  135. {
  136. if (infer_batch_size < num_image)
  137. {
  138. printf(
  139. "When using static shape model, number of images[%d] must be "
  140. "less than or equal to the maximum batch[%d].",
  141. num_image, infer_batch_size);
  142. return;
  143. }
  144. }
  145. }
  146. adjust_memory(num_image, image.width, image.height);
  147. uint8_t *image_device = preprocess_buffer_.gpu();
  148. uint8_t *image_host = preprocess_buffer_.cpu();
  149. size_t size_image = image.width * image.height * 3;
  150. cudaStream_t stream_ = (cudaStream_t)stream;
  151. memcpy(image_host, image.bgrptr, size_image);
  152. checkRuntime(
  153. cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
  154. affine::CropResizeMatrix crmatrix;
  155. for(int i = 0; i < num_image; i++)
  156. {
  157. data::Box* box_ptr = classfier_boxes_ptr[i];
  158. int x = (int)box_ptr->left;
  159. int y = (int)box_ptr->top;
  160. int w = (int)box_ptr->right - x;
  161. int h = (int)box_ptr->bottom - y;
  162. preprocess(i, crmatrix, x, y, w, h, stream);
  163. }
  164. #ifdef TRT10
  165. if (!trt_->forward(std::unordered_map<std::string, const void *>{
  166. { "input", input_buffer_.gpu() },
  167. { "output", output_buffer_.gpu() }
  168. }, stream_))
  169. {
  170. printf("Failed to tensorRT forward.\n");
  171. return cv::Mat();
  172. }
  173. #else
  174. std::vector<void *> bindings{input_buffer_.gpu(), output_buffer_.gpu()};
  175. if (!trt_->forward(bindings, stream))
  176. {
  177. printf("Failed to tensorRT forward.");
  178. return cv::Mat();
  179. }
  180. #endif
  181. for (int ib = 0; ib < num_image; ++ib)
  182. {
  183. float *output_buffer_device = output_buffer_.gpu() + ib * num_classes_;
  184. int *classes_indices_device = classes_indices_.gpu() + ib;
  185. classfier_softmax(output_buffer_device, num_classes_, classes_indices_device, stream_);
  186. }
  187. checkRuntime(cudaMemcpyAsync(output_buffer_.cpu(), output_buffer_.gpu(),
  188. output_buffer_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
  189. checkRuntime(cudaMemcpyAsync(classes_indices_.cpu(), classes_indices_.gpu(),
  190. classes_indices_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
  191. checkRuntime(cudaStreamSynchronize(stream_));
  192. for (int ib = 0; ib < num_image; ++ib)
  193. {
  194. int *max_index = classes_indices_.cpu() + ib;
  195. int index = *max_index;
  196. classfier_boxes_ptr[ib]->label = class_names_[index];
  197. }
  198. }
  199. }