
NVIDIA Jetson: TensorRT-accelerated YOLOv5 camera detection


When running detection directly on a live camera feed, the real-time preview was still a bit slow, so below is a record of the TensorRT acceleration process.

I. Equipment and environment

1. Device: Jetson AGX Xavier

2. JetPack 4.6.1

3. TensorRT 8.2.1.8

4. Conda virtual environment with Python 3.6

II. Setting up the virtual environment and dependencies

1. Install PyTorch by following this blog post:

Installing PyTorch 1.9.0 (GPU build) on the NVIDIA Jetson AGX Xavier (Ponnyao's blog, CSDN)

2. Install pycuda

  conda activate pytorch   # my virtual environment is named "pytorch"
  pip3 install pycuda
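
Before moving on, it may be worth confirming that pycuda can actually see the Xavier's GPU from inside the environment. A minimal sanity-check sketch (the file name pycuda_check.py is just an example, not part of the project):

  # pycuda_check.py: confirm pycuda can talk to the GPU
  import pycuda.autoinit          # creates a CUDA context on device 0
  import pycuda.driver as cuda

  dev = cuda.Device(0)
  print("Device:", dev.name())
  print("Compute capability:", dev.compute_capability())
  print("Total memory: %d MB" % (dev.total_memory() // (1024 * 1024)))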

3. Use TensorRT inside the virtual environment

  # Locate the system TensorRT installation
  sudo find / -name tensorrt*
  # Change into the virtual environment's site-packages directory
  cd /home/nvidia/archiconda/envs/pytorch/lib/python3.6/site-packages
  # Create a symlink to the system-wide TensorRT bindings
  ln -s /usr/lib/python3.6/dist-packages/tensorrt
  # If the step above does not work, link the shared object directly
  ln -s /usr/lib/python3.6/dist-packages/tensorrt/tensorrt.so
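
Once the symlink is in place, a quick import test (run inside the activated environment; trt_check.py is an example name) verifies that the bindings load and that the version matches the system TensorRT, 8.2.1.8 here:

  # trt_check.py: verify TensorRT is importable inside the conda env
  import tensorrt as trt

  print("TensorRT version:", trt.__version__)   # expect 8.2.1.8
  # Creating a logger and builder is a stronger check that the bindings actually work
  logger = trt.Logger(trt.Logger.WARNING)
  builder = trt.Builder(logger)
  print("Fast fp16 support:", builder.platform_has_fast_fp16)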

III. Acceleration process

        My packaged project: yolov5_tensorrt (CSDN download)

1. Download the projects

Using YOLOv5 v6.0 as an example:

  mkdir yolov5_tensorrt
  cd yolov5_tensorrt
  git clone -b v6.0 https://github.com/ultralytics/yolov5.git
  git clone https://github.com/wang-xinyu/tensorrtx.git

2. Download the yolov5s.pt weights

After downloading, place the file in the yolov5_tensorrt/yolov5 folder:

https://github.com/ultralytics/yolov5/releases/tag/v6.0

3. Convert the model: .pt -> .wts

  cp yolov5_tensorrt/tensorrtx/yolov5/gen_wts.py yolov5_tensorrt/yolov5
  cd yolov5_tensorrt/yolov5
  python3 gen_wts.py -w yolov5s.pt -o yolov5s.wts
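
gen_wts.py dumps the weights as plain text; in the tensorrtx .wts format, as far as I know, the first line is the number of weight blobs, followed by one "name count hex..." line per blob. A small sketch to peek at the result and confirm the conversion produced something sensible (assumes the file is in the current directory):

  # wts_check.py: peek at the generated .wts file (text format used by tensorrtx)
  with open("yolov5s.wts") as f:
      num_blobs = int(f.readline())           # first line: number of weight blobs
      name, count = f.readline().split()[:2]  # following lines: name, value count, hex values
      print("weight blobs:", num_blobs)
      print("first blob:", name, "with", count, "values")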

4. Build the engine file

  cd yolov5_tensorrt/tensorrtx/yolov5/
  mkdir build
  cd build
  # copy the .wts file into the build directory (path is relative to build/)
  cp ../../../yolov5/yolov5s.wts .
  cmake ..
  make
  sudo ./yolov5 -s yolov5s.wts yolov5s.engine s

This produces yolov5s.engine. The -s flag tells the demo binary to serialize the .wts weights into a TensorRT engine, and the trailing s selects the yolov5s model variant.
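
Before wiring up the camera, it can help to check that the engine deserializes cleanly. A minimal loader sketch, run from the build directory (engine_check.py is an example name; note that the custom plugin library must be loaded before deserialization, since the engine references the custom YOLO decode plugin):

  # engine_check.py: run from tensorrtx/yolov5/build
  import ctypes
  import tensorrt as trt

  ctypes.CDLL("./libmyplugins.so")      # load the custom plugin library first
  logger = trt.Logger(trt.Logger.INFO)
  with open("yolov5s.engine", "rb") as f:
      engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
  # Print every binding name and shape to confirm the engine loaded
  for i in range(engine.num_bindings):
      print(engine.get_binding_name(i), engine.get_binding_shape(i))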

5. Camera acceleration

The original author only provides accelerated image inference; below is a camera-inference version of the script, modified by another developer.

yolov5_trt_cam.py

  1. """
  2. An example that uses TensorRT's Python api to make inferences.
  3. """
  4. import ctypes
  5. import os
  6. import shutil
  7. import random
  8. import sys
  9. import threading
  10. import time
  11. import cv2
  12. import numpy as np
  13. import pycuda.autoinit
  14. import pycuda.driver as cuda
  15. import tensorrt as trt
  16. import torch
  17. import torchvision
  18. import argparse
  19. CONF_THRESH = 0.5
  20. IOU_THRESHOLD = 0.4
  21. def get_img_path_batches(batch_size, img_dir):
  22. ret = []
  23. batch = []
  24. for root, dirs, files in os.walk(img_dir):
  25. for name in files:
  26. if len(batch) == batch_size:
  27. ret.append(batch)
  28. batch = []
  29. batch.append(os.path.join(root, name))
  30. if len(batch) > 0:
  31. ret.append(batch)
  32. return ret
  33. def plot_one_box(x, img, color=None, label=None, line_thickness=None):
  34. """
  35. description: Plots one bounding box on image img,
  36. this function comes from YoLov5 project.
  37. param:
  38. x: a box likes [x1,y1,x2,y2]
  39. img: a opencv image object
  40. color: color to draw rectangle, such as (0,255,0)
  41. label: str
  42. line_thickness: int
  43. return:
  44. no return
  45. """
  46. tl = (
  47. line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
  48. ) # line/font thickness
  49. color = color or [random.randint(0, 255) for _ in range(3)]
  50. c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
  51. cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
  52. if label:
  53. tf = max(tl - 1, 1) # font thickness
  54. t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
  55. c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
  56. cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
  57. cv2.putText(
  58. img,
  59. label,
  60. (c1[0], c1[1] - 2),
  61. 0,
  62. tl / 3,
  63. [225, 255, 255],
  64. thickness=tf,
  65. lineType=cv2.LINE_AA,
  66. )
  67. class YoLov5TRT(object):
  68. """
  69. description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops.
  70. """
  71. def __init__(self, engine_file_path):
  72. # Create a Context on this device,
  73. self.ctx = cuda.Device(0).make_context()
  74. stream = cuda.Stream()
  75. TRT_LOGGER = trt.Logger(trt.Logger.INFO)
  76. runtime = trt.Runtime(TRT_LOGGER)
  77. # Deserialize the engine from file
  78. with open(engine_file_path, "rb") as f:
  79. engine = runtime.deserialize_cuda_engine(f.read())
  80. context = engine.create_execution_context()
  81. host_inputs = []
  82. cuda_inputs = []
  83. host_outputs = []
  84. cuda_outputs = []
  85. bindings = []
  86. for binding in engine:
  87. print('bingding:', binding, engine.get_binding_shape(binding))
  88. size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
  89. dtype = trt.nptype(engine.get_binding_dtype(binding))
  90. # Allocate host and device buffers
  91. host_mem = cuda.pagelocked_empty(size, dtype)
  92. cuda_mem = cuda.mem_alloc(host_mem.nbytes)
  93. # Append the device buffer to device bindings.
  94. bindings.append(int(cuda_mem))
  95. # Append to the appropriate list.
  96. if engine.binding_is_input(binding):
  97. self.input_w = engine.get_binding_shape(binding)[-1]
  98. self.input_h = engine.get_binding_shape(binding)[-2]
  99. host_inputs.append(host_mem)
  100. cuda_inputs.append(cuda_mem)
  101. else:
  102. host_outputs.append(host_mem)
  103. cuda_outputs.append(cuda_mem)
  104. # Store
  105. self.stream = stream
  106. self.context = context
  107. self.engine = engine
  108. self.host_inputs = host_inputs
  109. self.cuda_inputs = cuda_inputs
  110. self.host_outputs = host_outputs
  111. self.cuda_outputs = cuda_outputs
  112. self.bindings = bindings
  113. self.batch_size = engine.max_batch_size
  114. def infer(self, input_image_path):
  115. threading.Thread.__init__(self)
  116. # Make self the active context, pushing it on top of the context stack.
  117. self.ctx.push()
  118. self.input_image_path = input_image_path
  119. # Restore
  120. stream = self.stream
  121. context = self.context
  122. engine = self.engine
  123. host_inputs = self.host_inputs
  124. cuda_inputs = self.cuda_inputs
  125. host_outputs = self.host_outputs
  126. cuda_outputs = self.cuda_outputs
  127. bindings = self.bindings
  128. # Do image preprocess
  129. batch_image_raw = []
  130. batch_origin_h = []
  131. batch_origin_w = []
  132. batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
  133. input_image, image_raw, origin_h, origin_w = self.preprocess_image(input_image_path
  134. )
  135. batch_origin_h.append(origin_h)
  136. batch_origin_w.append(origin_w)
  137. np.copyto(batch_input_image, input_image)
  138. batch_input_image = np.ascontiguousarray(batch_input_image)
  139. # Copy input image to host buffer
  140. np.copyto(host_inputs[0], batch_input_image.ravel())
  141. start = time.time()
  142. # Transfer input data to the GPU.
  143. cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
  144. # Run inference.
  145. context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
  146. # Transfer predictions back from the GPU.
  147. cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
  148. # Synchronize the stream
  149. stream.synchronize()
  150. end = time.time()
  151. # Remove any context from the top of the context stack, deactivating it.
  152. self.ctx.pop()
  153. # Here we use the first row of output in that batch_size = 1
  154. output = host_outputs[0]
  155. # Do postprocess
  156. result_boxes, result_scores, result_classid = self.post_process(
  157. output, origin_h, origin_w)
  158. # Draw rectangles and labels on the original image
  159. for j in range(len(result_boxes)):
  160. box = result_boxes[j]
  161. plot_one_box(
  162. box,
  163. image_raw,
  164. label="{}:{:.2f}".format(
  165. categories[int(result_classid[j])], result_scores[j]
  166. ),
  167. )
  168. return image_raw, end - start
  169. def destroy(self):
  170. # Remove any context from the top of the context stack, deactivating it.
  171. self.ctx.pop()
  172. def get_raw_image(self, image_path_batch):
  173. """
  174. description: Read an image from image path
  175. """
  176. for img_path in image_path_batch:
  177. yield cv2.imread(img_path)
  178. def get_raw_image_zeros(self, image_path_batch=None):
  179. """
  180. description: Ready data for warmup
  181. """
  182. for _ in range(self.batch_size):
  183. yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)
  184. def preprocess_image(self, input_image_path):
  185. """
  186. description: Convert BGR image to RGB,
  187. resize and pad it to target size, normalize to [0,1],
  188. transform to NCHW format.
  189. param:
  190. input_image_path: str, image path
  191. return:
  192. image: the processed image
  193. image_raw: the original image
  194. h: original height
  195. w: original width
  196. """
  197. image_raw = input_image_path
  198. h, w, c = image_raw.shape
  199. image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
  200. # Calculate widht and height and paddings
  201. r_w = self.input_w / w
  202. r_h = self.input_h / h
  203. if r_h > r_w:
  204. tw = self.input_w
  205. th = int(r_w * h)
  206. tx1 = tx2 = 0
  207. ty1 = int((self.input_h - th) / 2)
  208. ty2 = self.input_h - th - ty1
  209. else:
  210. tw = int(r_h * w)
  211. th = self.input_h
  212. tx1 = int((self.input_w - tw) / 2)
  213. tx2 = self.input_w - tw - tx1
  214. ty1 = ty2 = 0
  215. # Resize the image with long side while maintaining ratio
  216. image = cv2.resize(image, (tw, th))
  217. # Pad the short side with (128,128,128)
  218. image = cv2.copyMakeBorder(
  219. image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, (128, 128, 128)
  220. )
  221. image = image.astype(np.float32)
  222. # Normalize to [0,1]
  223. image /= 255.0
  224. # HWC to CHW format:
  225. image = np.transpose(image, [2, 0, 1])
  226. # CHW to NCHW format
  227. image = np.expand_dims(image, axis=0)
  228. # Convert the image to row-major order, also known as "C order":
  229. image = np.ascontiguousarray(image)
  230. return image, image_raw, h, w
  231. def xywh2xyxy(self, origin_h, origin_w, x):
  232. """
  233. description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
  234. param:
  235. origin_h: height of original image
  236. origin_w: width of original image
  237. x: A boxes tensor, each row is a box [center_x, center_y, w, h]
  238. return:
  239. y: A boxes tensor, each row is a box [x1, y1, x2, y2]
  240. """
  241. y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
  242. r_w = self.input_w / origin_w
  243. r_h = self.input_h / origin_h
  244. if r_h > r_w:
  245. y[:, 0] = x[:, 0] - x[:, 2] / 2
  246. y[:, 2] = x[:, 0] + x[:, 2] / 2
  247. y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
  248. y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
  249. y /= r_w
  250. else:
  251. y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
  252. y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
  253. y[:, 1] = x[:, 1] - x[:, 3] / 2
  254. y[:, 3] = x[:, 1] + x[:, 3] / 2
  255. y /= r_h
  256. return y
  257. def post_process(self, output, origin_h, origin_w):
  258. """
  259. description: postprocess the prediction
  260. param:
  261. output: A tensor likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
  262. origin_h: height of original image
  263. origin_w: width of original image
  264. return:
  265. result_boxes: finally boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
  266. result_scores: finally scores, a tensor, each element is the score correspoing to box
  267. result_classid: finally classid, a tensor, each element is the classid correspoing to box
  268. """
  269. # Get the num of boxes detected
  270. num = int(output[0])
  271. # Reshape to a two dimentional ndarray
  272. pred = np.reshape(output[1:], (-1, 6))[:num, :]
  273. # to a torch Tensor
  274. pred = torch.Tensor(pred).cuda()
  275. # Get the boxes
  276. boxes = pred[:, :4]
  277. # Get the scores
  278. scores = pred[:, 4]
  279. # Get the classid
  280. classid = pred[:, 5]
  281. # Choose those boxes that score > CONF_THRESH
  282. si = scores > CONF_THRESH
  283. boxes = boxes[si, :]
  284. scores = scores[si]
  285. classid = classid[si]
  286. # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
  287. boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
  288. # Do nms
  289. indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
  290. result_boxes = boxes[indices, :].cpu()
  291. result_scores = scores[indices].cpu()
  292. result_classid = classid[indices].cpu()
  293. return result_boxes, result_scores, result_classid
  294. class inferThread(threading.Thread):
  295. def __init__(self, yolov5_wrapper):
  296. threading.Thread.__init__(self)
  297. self.yolov5_wrapper = yolov5_wrapper
  298. def infer(self , frame):
  299. batch_image_raw, use_time = self.yolov5_wrapper.infer(frame)
  300. # for i, img_path in enumerate(self.image_path_batch):
  301. # parent, filename = os.path.split(img_path)
  302. # save_name = os.path.join('output', filename)
  303. # # Save image
  304. # cv2.imwrite(save_name, batch_image_raw[i])
  305. # print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000))
  306. return batch_image_raw,use_time
  307. class warmUpThread(threading.Thread):
  308. def __init__(self, yolov5_wrapper):
  309. threading.Thread.__init__(self)
  310. self.yolov5_wrapper = yolov5_wrapper
  311. def run(self):
  312. batch_image_raw, use_time = self.yolov5_wrapper.infer(self.yolov5_wrapper.get_raw_image_zeros())
  313. print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000))
  314. if __name__ == "__main__":
  315. # load custom plugins
  316. parser = argparse.ArgumentParser()
  317. parser.add_argument('--engine', nargs='+', type=str, default="build/yolov5s.engine", help='.engine path(s)')
  318. parser.add_argument('--save', type=int, default=0, help='save?')
  319. opt = parser.parse_args()
  320. PLUGIN_LIBRARY = "build/libmyplugins.so"
  321. engine_file_path = opt.engine
  322. ctypes.CDLL(PLUGIN_LIBRARY)
  323. # load coco labels
  324. categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
  325. "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
  326. "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
  327. "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
  328. "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
  329. "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
  330. "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
  331. "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
  332. "hair drier", "toothbrush"]
  333. # a YoLov5TRT instance
  334. yolov5_wrapper = YoLov5TRT(engine_file_path)
  335. cap = cv2.VideoCapture(0)
  336. try:
  337. thread1 = inferThread(yolov5_wrapper)
  338. thread1.start()
  339. thread1.join()
  340. while 1:
  341. _,frame = cap.read()
  342. img,t=thread1.infer(frame)
  343. cv2.imshow("result", img)
  344. if cv2.waitKey(1) & 0XFF == ord('q'): # 1 millisecond
  345. break
  346. finally:
  347. # destroy the instance
  348. cap.release()
  349. cv2.destroyAllWindows()
  350. yolov5_wrapper.destroy()
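
With yolov5s.engine and libmyplugins.so sitting in build/, the script can be launched from the tensorrtx/yolov5 directory with python3 yolov5_trt_cam.py, optionally passing --engine to point at a different engine file; press q in the preview window to quit. If your camera is not at index 0 (for example a CSI camera on the Xavier), adjust the cv2.VideoCapture(0) call accordingly.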

References

tensorrtx/yolov5 at master · wang-xinyu/tensorrtx · GitHub

Real-time detection with TensorRT-accelerated YOLOv5 on the Jetson AGX Xavier (围白的尾巴's blog, CSDN)