resnet.cu

#include "infer.hpp"
#include "resnet.hpp"

namespace resnet
{
using namespace std;

#define GPU_BLOCK_THREADS 512

#define checkRuntime(call)                                                                 \
  do {                                                                                     \
    auto ___call__ret_code__ = (call);                                                     \
    if (___call__ret_code__ != cudaSuccess) {                                              \
      INFO("CUDA Runtime error💥 %s # %s, code = %s [ %d ]", #call,                        \
           cudaGetErrorString(___call__ret_code__), cudaGetErrorName(___call__ret_code__), \
           ___call__ret_code__);                                                           \
      abort();                                                                             \
    }                                                                                      \
  } while (0)

#define checkKernel(...)                 \
  do {                                   \
    { (__VA_ARGS__); }                   \
    checkRuntime(cudaPeekAtLastError()); \
  } while (0)

enum class NormType : int { None = 0, MeanStd = 1, AlphaBeta = 2 };
enum class ChannelType : int { None = 0, SwapRB = 1 };
/* Normalization: supports mean/std, alpha/beta, and swapping the R/B channels. */
struct Norm {
  float mean[3];
  float std[3];
  float alpha, beta;
  NormType type = NormType::None;
  ChannelType channel_type = ChannelType::None;

  // out = (x * alpha - mean) / std
  static Norm mean_std(const float mean[3], const float std[3], float alpha = 1 / 255.0f,
                       ChannelType channel_type = ChannelType::None);

  // out = x * alpha + beta
  static Norm alpha_beta(float alpha, float beta = 0, ChannelType channel_type = ChannelType::None);

  // None
  static Norm None();
};

Norm Norm::mean_std(const float mean[3], const float std[3], float alpha,
                    ChannelType channel_type) {
  Norm out;
  out.type = NormType::MeanStd;
  out.alpha = alpha;
  out.channel_type = channel_type;
  memcpy(out.mean, mean, sizeof(out.mean));
  memcpy(out.std, std, sizeof(out.std));
  return out;
}

Norm Norm::alpha_beta(float alpha, float beta, ChannelType channel_type) {
  Norm out;
  out.type = NormType::AlphaBeta;
  out.alpha = alpha;
  out.beta = beta;
  out.channel_type = channel_type;
  return out;
}

Norm Norm::None() { return Norm(); }
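
// Example (not used by this file): an ImageNet-style mean/std preprocessing could be
// built as
//   float mean[3]{0.485f, 0.456f, 0.406f}, std[3]{0.229f, 0.224f, 0.225f};
//   Norm n = Norm::mean_std(mean, std, 1 / 255.0f, ChannelType::SwapRB);
// load() further below instead uses Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB).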

static dim3 grid_dims(int numJobs) {
  int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
  return dim3(((numJobs + numBlockThreads - 1) / (float)numBlockThreads));
}

static dim3 block_dims(int numJobs) {
  return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
}

inline int upbound(int n, int align = 32) { return (n + align - 1) / align * align; }
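// e.g. upbound(24, 32) == 32; used below to keep the 2x3 affine matrix (6 floats = 24 bytes)
// aligned inside the preprocessing workspace.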

static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(
    uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width,
    int dst_height, uint8_t const_value_st, float *warp_affine_matrix_2_3, Norm norm) {
  int dx = blockDim.x * blockIdx.x + threadIdx.x;
  int dy = blockDim.y * blockIdx.y + threadIdx.y;
  if (dx >= dst_width || dy >= dst_height) return;

  float m_x1 = warp_affine_matrix_2_3[0];
  float m_y1 = warp_affine_matrix_2_3[1];
  float m_z1 = warp_affine_matrix_2_3[2];
  float m_x2 = warp_affine_matrix_2_3[3];
  float m_y2 = warp_affine_matrix_2_3[4];
  float m_z2 = warp_affine_matrix_2_3[5];

  float src_x = m_x1 * dx + m_y1 * dy + m_z1;
  float src_y = m_x2 * dx + m_y2 * dy + m_z2;
  float c0, c1, c2;

  if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
    // out of range
    c0 = const_value_st;
    c1 = const_value_st;
    c2 = const_value_st;
  } else {
    int y_low = floorf(src_y);
    int x_low = floorf(src_x);
    int y_high = y_low + 1;
    int x_high = x_low + 1;

    uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
    float ly = src_y - y_low;
    float lx = src_x - x_low;
    float hy = 1 - ly;
    float hx = 1 - lx;
    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
    uint8_t *v1 = const_value;
    uint8_t *v2 = const_value;
    uint8_t *v3 = const_value;
    uint8_t *v4 = const_value;
    if (y_low >= 0) {
      if (x_low >= 0) v1 = src + y_low * src_line_size + x_low * 3;
      if (x_high < src_width) v2 = src + y_low * src_line_size + x_high * 3;
    }
    if (y_high < src_height) {
      if (x_low >= 0) v3 = src + y_high * src_line_size + x_low * 3;
      if (x_high < src_width) v4 = src + y_high * src_line_size + x_high * 3;
    }
    // same rounding behavior as OpenCV
    c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
    c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
    c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
  }

  if (norm.channel_type == ChannelType::SwapRB) {
    float t = c2;
    c2 = c0;
    c0 = t;
  }

  if (norm.type == NormType::MeanStd) {
    c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
    c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
    c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
  } else if (norm.type == NormType::AlphaBeta) {
    c0 = c0 * norm.alpha + norm.beta;
    c1 = c1 * norm.alpha + norm.beta;
    c2 = c2 * norm.alpha + norm.beta;
  }

  int area = dst_width * dst_height;
  float *pdst_c0 = dst + dy * dst_width + dx;
  float *pdst_c1 = pdst_c0 + area;
  float *pdst_c2 = pdst_c1 + area;
  *pdst_c0 = c0;
  *pdst_c1 = c1;
  *pdst_c2 = c2;
}

static void warp_affine_bilinear_and_normalize_plane(uint8_t *src, int src_line_size, int src_width,
                                                     int src_height, float *dst, int dst_width,
                                                     int dst_height, float *matrix_2_3,
                                                     uint8_t const_value, const Norm &norm,
                                                     cudaStream_t stream) {
  dim3 grid((dst_width + 31) / 32, (dst_height + 31) / 32);
  dim3 block(32, 32);
  checkKernel(warp_affine_bilinear_and_normalize_plane_kernel<<<grid, block, 0, stream>>>(
      src, src_line_size, src_width, src_height, dst, dst_width, dst_height, const_value,
      matrix_2_3, norm));
}

struct AffineMatrix {
  float i2d[6];  // image to dst(network), 2x3 matrix
  float d2i[6];  // dst to image, 2x3 matrix

  void compute(const std::tuple<int, int> &from, const std::tuple<int, int> &to) {
    float scale_x = get<0>(to) / (float)get<0>(from);
    float scale_y = get<1>(to) / (float)get<1>(from);
    float scale = std::min(scale_x, scale_y);

    // letter box
    // i2d[0] = scale;
    // i2d[1] = 0;
    // i2d[2] = -scale * get<0>(from) * 0.5 + get<0>(to) * 0.5 + scale * 0.5 - 0.5;
    // i2d[3] = 0;
    // i2d[4] = scale;
    // i2d[5] = -scale * get<1>(from) * 0.5 + get<1>(to) * 0.5 + scale * 0.5 - 0.5;

    // resize
    i2d[0] = scale;
    i2d[1] = 0;
    i2d[2] = 0;
    i2d[3] = 0;
    i2d[4] = scale;
    i2d[5] = 0;

    double D = i2d[0] * i2d[4] - i2d[1] * i2d[3];
    D = D != 0. ? double(1.) / D : double(0.);
    double A11 = i2d[4] * D, A22 = i2d[0] * D, A12 = -i2d[1] * D, A21 = -i2d[3] * D;
    double b1 = -A11 * i2d[2] - A12 * i2d[5];
    double b2 = -A21 * i2d[2] - A22 * i2d[5];
    d2i[0] = A11;
    d2i[1] = A12;
    d2i[2] = b1;
    d2i[3] = A21;
    d2i[4] = A22;
    d2i[5] = b2;
  }
};
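
// The forward map i2d takes image coordinates to network-input coordinates:
//   [x_dst]   [i2d[0] i2d[1] i2d[2]]   [x_img]
//   [y_dst] = [i2d[3] i2d[4] i2d[5]] * [y_img]
//                                      [  1  ]
// d2i is its inverse (computed via the 2x2 determinant D above); preprocess()
// uploads d2i so the warp kernel can map each destination pixel back to a source
// coordinate and sample it bilinearly.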

// Classification post-processing kernel: declared here but left as an empty stub.
static __global__ void classfier(float *predict, cudaStream_t stream)
{
}
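
// A minimal sketch of what the missing post-processing could look like: one block
// per image, a single thread scanning num_classes logits for the arg-max. The kernel
// name and its parameters are illustrative only; this file leaves the step unimplemented.
static __global__ void argmax_sketch_kernel(const float *predict, int num_classes,
                                            int *labels, float *confidences) {
  int ib = blockIdx.x;           // one block per image in the batch
  if (threadIdx.x != 0) return;  // single-thread scan keeps the sketch simple
  const float *logits = predict + ib * num_classes;
  int best = 0;
  for (int i = 1; i < num_classes; ++i)
    if (logits[i] > logits[best]) best = i;
  labels[ib] = best;
  confidences[ib] = logits[best];
}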

class InferImpl : public Infer {
 public:
  shared_ptr<trt::Infer> trt_;
  string engine_file_;
  vector<shared_ptr<trt::Memory<unsigned char>>> preprocess_buffers_;
  trt::Memory<float> input_buffer_, output_array_;
  int network_input_width_, network_input_height_;
  Norm normalize_;
  int num_classes_ = 0;
  bool isdynamic_model_ = false;

  virtual ~InferImpl() = default;

  void adjust_memory(int batch_size) {
    // the inference batch_size
    size_t input_numel = network_input_width_ * network_input_height_ * 3;
    input_buffer_.gpu(batch_size * input_numel);
    output_array_.gpu(batch_size * num_classes_);
    output_array_.cpu(batch_size * num_classes_);

    if ((int)preprocess_buffers_.size() < batch_size) {
      for (int i = preprocess_buffers_.size(); i < batch_size; ++i)
        preprocess_buffers_.push_back(make_shared<trt::Memory<unsigned char>>());
    }
  }

  void preprocess(int ibatch, const Image &image,
                  shared_ptr<trt::Memory<unsigned char>> preprocess_buffer,
                  void *stream = nullptr) {
    AffineMatrix affine;
    affine.compute(make_tuple(image.width, image.height),
                   make_tuple(network_input_width_, network_input_height_));

    size_t input_numel = network_input_width_ * network_input_height_ * 3;
    float *input_device = input_buffer_.gpu() + ibatch * input_numel;
    size_t size_image = image.width * image.height * 3;
    size_t size_matrix = upbound(sizeof(affine.d2i), 32);

    // workspace layout: [ aligned 2x3 affine matrix | BGR image bytes ]
    uint8_t *gpu_workspace = preprocess_buffer->gpu(size_matrix + size_image);
    float *affine_matrix_device = (float *)gpu_workspace;
    uint8_t *image_device = gpu_workspace + size_matrix;

    uint8_t *cpu_workspace = preprocess_buffer->cpu(size_matrix + size_image);
    float *affine_matrix_host = (float *)cpu_workspace;
    uint8_t *image_host = cpu_workspace + size_matrix;

    // stage the image and matrix in the host workspace, then issue async H2D copies
    cudaStream_t stream_ = (cudaStream_t)stream;
    memcpy(image_host, image.bgrptr, size_image);
    memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
    checkRuntime(
        cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
    checkRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i),
                                 cudaMemcpyHostToDevice, stream_));

    warp_affine_bilinear_and_normalize_plane(image_device, image.width * 3, image.width,
                                             image.height, input_device, network_input_width_,
                                             network_input_height_, affine_matrix_device, 114,
                                             normalize_, stream_);
  }

  bool load(const string &engine_file) {
    trt_ = trt::load(engine_file);
    if (trt_ == nullptr) return false;

    trt_->print();
    auto input_dim = trt_->static_dims(0);
    network_input_width_ = input_dim[3];
    network_input_height_ = input_dim[2];
    isdynamic_model_ = trt_->has_dynamic_dim();
    normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB);
    num_classes_ = trt_->static_dims(1)[1];
    return true;
  }

  virtual Attribute forward(const Image &image, void *stream = nullptr) override {
    auto output = forwards({image}, stream);
    if (output.empty()) return {};
    return output[0];
  }

  virtual vector<Attribute> forwards(const vector<Image> &images, void *stream = nullptr) override {
    int num_image = images.size();
    if (num_image == 0) return {};

    auto input_dims = trt_->static_dims(0);
    int infer_batch_size = input_dims[0];
    if (infer_batch_size != num_image) {
      if (isdynamic_model_) {
        infer_batch_size = num_image;
        input_dims[0] = num_image;
        if (!trt_->set_run_dims(0, input_dims)) return {};
      } else {
        if (infer_batch_size < num_image) {
          INFO(
              "When using static shape model, number of images[%d] must be "
              "less than or equal to the maximum batch[%d].",
              num_image, infer_batch_size);
          return {};
        }
      }
    }

    adjust_memory(infer_batch_size);
    cudaStream_t stream_ = (cudaStream_t)stream;
    for (int i = 0; i < num_image; ++i)
      preprocess(i, images[i], preprocess_buffers_[i], stream);

    float *output_array_device = output_array_.gpu();
    vector<void *> bindings{input_buffer_.gpu(), output_array_device};
    if (!trt_->forward(bindings, stream)) {
  279. INFO("Failed to tensorRT forward.");
      return {};
    }

    checkRuntime(cudaMemcpyAsync(output_array_.cpu(), output_array_.gpu(),
                                 output_array_.gpu_bytes(), cudaMemcpyDeviceToHost, stream_));
    checkRuntime(cudaStreamSynchronize(stream_));
    printf("size : %d\n", (int)output_array_.cpu_size());

    // TODO: decode output_array_.cpu() into per-image results; left unimplemented here.
    // for (int ib = 0; ib < num_image; ++ib) {
    // }
    vector<Attribute> arrout(num_image);
    return arrout;
  }
};

Infer *loadraw(const std::string &engine_file) {
  InferImpl *impl = new InferImpl();
  if (!impl->load(engine_file)) {
    delete impl;
    impl = nullptr;
  }
  return impl;
}

shared_ptr<Infer> load(const string &engine_file) {
  return std::shared_ptr<InferImpl>((InferImpl *)loadraw(engine_file));
}
}
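
// Usage sketch (illustrative only; Image and Attribute come from infer.hpp / resnet.hpp,
// and the engine path below is an assumption):
//   auto model = resnet::load("resnet.engine");
//   if (model) {
//     resnet::Image img;  // assumed to carry bgrptr, width, height of a BGR8 frame
//     auto attr = model->forward(img, /*stream=*/nullptr);
//   }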