# YOLOv5 Comet integration (utils/loggers/comet/__init__.py)
import glob
import json
import logging
import os
import sys
from pathlib import Path

logger = logging.getLogger(__name__)

FILE = Path(__file__).resolve()
ROOT = FILE.parents[3]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH

try:
    import comet_ml

    # Project Configuration
    config = comet_ml.config.get_config()
    COMET_PROJECT_NAME = config.get_string(os.getenv('COMET_PROJECT_NAME'), 'comet.project_name', default='yolov5')
except ImportError:
    # comet_ml is optional; leave sentinels so the rest of the module can import.
    comet_ml = None
    COMET_PROJECT_NAME = None

import PIL
import torch
import torchvision.transforms as T
import yaml

from utils.dataloaders import img2label_paths
from utils.general import check_dataset, scale_boxes, xywh2xyxy
from utils.metrics import box_iou
  27. COMET_PREFIX = 'comet://'
  28. COMET_MODE = os.getenv('COMET_MODE', 'online')
  29. # Model Saving Settings
  30. COMET_MODEL_NAME = os.getenv('COMET_MODEL_NAME', 'yolov5')
  31. # Dataset Artifact Settings
  32. COMET_UPLOAD_DATASET = os.getenv('COMET_UPLOAD_DATASET', 'false').lower() == 'true'
  33. # Evaluation Settings
  34. COMET_LOG_CONFUSION_MATRIX = (os.getenv('COMET_LOG_CONFUSION_MATRIX', 'true').lower() == 'true')
  35. COMET_LOG_PREDICTIONS = os.getenv('COMET_LOG_PREDICTIONS', 'true').lower() == 'true'
  36. COMET_MAX_IMAGE_UPLOADS = int(os.getenv('COMET_MAX_IMAGE_UPLOADS', 100))
  37. # Confusion Matrix Settings
  38. CONF_THRES = float(os.getenv('CONF_THRES', 0.001))
  39. IOU_THRES = float(os.getenv('IOU_THRES', 0.6))
  40. # Batch Logging Settings
  41. COMET_LOG_BATCH_METRICS = (os.getenv('COMET_LOG_BATCH_METRICS', 'false').lower() == 'true')
  42. COMET_BATCH_LOGGING_INTERVAL = os.getenv('COMET_BATCH_LOGGING_INTERVAL', 1)
  43. COMET_PREDICTION_LOGGING_INTERVAL = os.getenv('COMET_PREDICTION_LOGGING_INTERVAL', 1)
  44. COMET_LOG_PER_CLASS_METRICS = (os.getenv('COMET_LOG_PER_CLASS_METRICS', 'false').lower() == 'true')
  45. RANK = int(os.getenv('RANK', -1))
  46. to_pil = T.ToPILImage()
class CometLogger:
    """Log metrics, parameters, source code, models and much more
    with Comet
    """

    def __init__(self, opt, hyp, run_id=None, job_type='Training', **experiment_kwargs) -> None:
        """Create (or resume) a Comet experiment and log the run configuration.

        Args:
            opt: argparse-style namespace of training options (save_period, data,
                name, save_dir, hyp, bbox_interval, epochs, upload_dataset, resume ...).
            hyp: hyperparameter dict.
            run_id: existing Comet experiment id to resume, or None for a new run.
            job_type: label stored on the logger (e.g. 'Training').
            **experiment_kwargs: extra kwargs forwarded to the Comet Experiment.
        """
        self.job_type = job_type
        self.opt = opt
        self.hyp = hyp

        # Comet Flags
        self.comet_mode = COMET_MODE
        self.save_model = opt.save_period > -1  # checkpoint uploads only when periodic saving is on
        self.model_name = COMET_MODEL_NAME

        # Batch Logging Settings
        self.log_batch_metrics = COMET_LOG_BATCH_METRICS
        self.comet_log_batch_interval = COMET_BATCH_LOGGING_INTERVAL

        # Dataset Artifact Settings
        self.upload_dataset = self.opt.upload_dataset or COMET_UPLOAD_DATASET
        self.resume = self.opt.resume

        # Default parameters to pass to Experiment objects
        self.default_experiment_kwargs = {
            'log_code': False,
            'log_env_gpu': True,
            'log_env_cpu': True,
            'project_name': COMET_PROJECT_NAME, }
        self.default_experiment_kwargs.update(experiment_kwargs)
        self.experiment = self._get_experiment(self.comet_mode, run_id)
        self.experiment.set_name(self.opt.name)

        # May download the dataset from a Comet Artifact (comet:// path) or
        # fall back to the standard YOLOv5 dataset check.
        self.data_dict = self.check_dataset(self.opt.data)
        self.class_names = self.data_dict['names']
        self.num_classes = self.data_dict['nc']

        self.logged_images_count = 0
        self.max_images = COMET_MAX_IMAGE_UPLOADS

        # Only a brand-new run logs the creation metadata and config assets.
        if run_id is None:
            self.experiment.log_other('Created from', 'YOLOv5')
            if not isinstance(self.experiment, comet_ml.OfflineExperiment):
                workspace, project_name, experiment_id = self.experiment.url.split('/')[-3:]
                self.experiment.log_other(
                    'Run Path',
                    f'{workspace}/{project_name}/{experiment_id}',
                )
            self.log_parameters(vars(opt))
            self.log_parameters(self.opt.hyp)
            self.log_asset_data(
                self.opt.hyp,
                name='hyperparameters.json',
                metadata={'type': 'hyp-config-file'},
            )
            self.log_asset(
                f'{self.opt.save_dir}/opt.yaml',
                metadata={'type': 'opt-config-file'},
            )

        self.comet_log_confusion_matrix = COMET_LOG_CONFUSION_MATRIX

        # Prefer thresholds from opt (val.py passes them); else env-var defaults.
        if hasattr(self.opt, 'conf_thres'):
            self.conf_thres = self.opt.conf_thres
        else:
            self.conf_thres = CONF_THRES
        if hasattr(self.opt, 'iou_thres'):
            self.iou_thres = self.opt.iou_thres
        else:
            self.iou_thres = IOU_THRES

        self.log_parameters({'val_iou_threshold': self.iou_thres, 'val_conf_threshold': self.conf_thres})

        self.comet_log_predictions = COMET_LOG_PREDICTIONS
        # bbox_interval == -1 means "auto": log ~10 times over the run, at least every epoch.
        if self.opt.bbox_interval == -1:
            self.comet_log_prediction_interval = (1 if self.opt.epochs < 10 else self.opt.epochs // 10)
        else:
            self.comet_log_prediction_interval = self.opt.bbox_interval

        if self.comet_log_predictions:
            self.metadata_dict = {}  # image_name -> list of box annotations (see log_predictions)
            self.logged_image_names = []

        self.comet_log_per_class_metrics = COMET_LOG_PER_CLASS_METRICS

        self.experiment.log_others({
            'comet_mode': COMET_MODE,
            'comet_max_image_uploads': COMET_MAX_IMAGE_UPLOADS,
            'comet_log_per_class_metrics': COMET_LOG_PER_CLASS_METRICS,
            'comet_log_batch_metrics': COMET_LOG_BATCH_METRICS,
            'comet_log_confusion_matrix': COMET_LOG_CONFUSION_MATRIX,
            'comet_model_name': COMET_MODEL_NAME, })

        # Check if running the Experiment with the Comet Optimizer
        if hasattr(self.opt, 'comet_optimizer_id'):
            self.experiment.log_other('optimizer_id', self.opt.comet_optimizer_id)
            self.experiment.log_other('optimizer_objective', self.opt.comet_optimizer_objective)
            self.experiment.log_other('optimizer_metric', self.opt.comet_optimizer_metric)
            self.experiment.log_other('optimizer_parameters', json.dumps(self.hyp))
  130. def _get_experiment(self, mode, experiment_id=None):
  131. if mode == 'offline':
  132. if experiment_id is not None:
  133. return comet_ml.ExistingOfflineExperiment(
  134. previous_experiment=experiment_id,
  135. **self.default_experiment_kwargs,
  136. )
  137. return comet_ml.OfflineExperiment(**self.default_experiment_kwargs, )
  138. else:
  139. try:
  140. if experiment_id is not None:
  141. return comet_ml.ExistingExperiment(
  142. previous_experiment=experiment_id,
  143. **self.default_experiment_kwargs,
  144. )
  145. return comet_ml.Experiment(**self.default_experiment_kwargs)
  146. except ValueError:
  147. logger.warning('COMET WARNING: '
  148. 'Comet credentials have not been set. '
  149. 'Comet will default to offline logging. '
  150. 'Please set your credentials to enable online logging.')
  151. return self._get_experiment('offline', experiment_id)
  152. return
    def log_metrics(self, log_dict, **kwargs):
        """Forward a dict of scalar metrics to the Comet experiment."""
        self.experiment.log_metrics(log_dict, **kwargs)

    def log_parameters(self, log_dict, **kwargs):
        """Forward a dict of (hyper)parameters to the Comet experiment."""
        self.experiment.log_parameters(log_dict, **kwargs)

    def log_asset(self, asset_path, **kwargs):
        """Upload a file asset to the Comet experiment."""
        self.experiment.log_asset(asset_path, **kwargs)

    def log_asset_data(self, asset, **kwargs):
        """Upload in-memory data (e.g. a dict) as an asset to the Comet experiment."""
        self.experiment.log_asset_data(asset, **kwargs)

    def log_image(self, img, **kwargs):
        """Upload an image to the Comet experiment."""
        self.experiment.log_image(img, **kwargs)
  163. def log_model(self, path, opt, epoch, fitness_score, best_model=False):
  164. if not self.save_model:
  165. return
  166. model_metadata = {
  167. 'fitness_score': fitness_score[-1],
  168. 'epochs_trained': epoch + 1,
  169. 'save_period': opt.save_period,
  170. 'total_epochs': opt.epochs, }
  171. model_files = glob.glob(f'{path}/*.pt')
  172. for model_path in model_files:
  173. name = Path(model_path).name
  174. self.experiment.log_model(
  175. self.model_name,
  176. file_or_folder=model_path,
  177. file_name=name,
  178. metadata=model_metadata,
  179. overwrite=True,
  180. )
  181. def check_dataset(self, data_file):
  182. with open(data_file) as f:
  183. data_config = yaml.safe_load(f)
  184. path = data_config.get('path')
  185. if path and path.startswith(COMET_PREFIX):
  186. path = data_config['path'].replace(COMET_PREFIX, '')
  187. data_dict = self.download_dataset_artifact(path)
  188. return data_dict
  189. self.log_asset(self.opt.data, metadata={'type': 'data-config-file'})
  190. return check_dataset(data_file)
    def log_predictions(self, image, labelsn, path, shape, predn):
        """Log one validation image plus its ground-truth and predicted boxes.

        The native-resolution image is uploaded once per (image, epoch) pair;
        box annotations accumulate in ``self.metadata_dict`` keyed by image name
        and are uploaded as one JSON asset in ``on_train_end``.

        Args:
            labelsn: native-space labels, rows of ``[cls, x1, y1, x2, y2]``.
            predn: native-space predictions, rows of ``[x1, y1, x2, y2, conf, cls]``.
        """
        # Respect the global image-upload budget (COMET_MAX_IMAGE_UPLOADS).
        if self.logged_images_count >= self.max_images:
            return
        detections = predn[predn[:, 4] > self.conf_thres]
        iou = box_iou(labelsn[:, 1:], detections[:, :4])
        # Label-row indices of (label, detection) pairs whose IoU passes the threshold.
        mask, _ = torch.where(iou > self.iou_thres)
        if len(mask) == 0:
            return
        # NOTE(review): `mask` holds *label* indices but is also used to index
        # `detections`; when label and detection counts differ this selects
        # detections by label index — confirm this is the intended pairing.
        filtered_detections = detections[mask]
        filtered_labels = labelsn[mask]

        image_id = path.split('/')[-1].split('.')[0]
        image_name = f'{image_id}_curr_epoch_{self.experiment.curr_epoch}'
        if image_name not in self.logged_image_names:
            native_scale_image = PIL.Image.open(path)
            self.log_image(native_scale_image, name=image_name)
            self.logged_image_names.append(image_name)

        metadata = []
        # Ground-truth boxes: fixed score 100, label suffixed with "-gt".
        for cls, *xyxy in filtered_labels.tolist():
            metadata.append({
                'label': f'{self.class_names[int(cls)]}-gt',
                'score': 100,
                'box': {
                    'x': xyxy[0],
                    'y': xyxy[1],
                    'x2': xyxy[2],
                    'y2': xyxy[3]}, })
        # Predicted boxes: confidence rescaled to a 0-100 score.
        for *xyxy, conf, cls in filtered_detections.tolist():
            metadata.append({
                'label': f'{self.class_names[int(cls)]}',
                'score': conf * 100,
                'box': {
                    'x': xyxy[0],
                    'y': xyxy[1],
                    'x2': xyxy[2],
                    'y2': xyxy[3]}, })

        self.metadata_dict[image_name] = metadata
        self.logged_images_count += 1
        return
  229. def preprocess_prediction(self, image, labels, shape, pred):
  230. nl, _ = labels.shape[0], pred.shape[0]
  231. # Predictions
  232. if self.opt.single_cls:
  233. pred[:, 5] = 0
  234. predn = pred.clone()
  235. scale_boxes(image.shape[1:], predn[:, :4], shape[0], shape[1])
  236. labelsn = None
  237. if nl:
  238. tbox = xywh2xyxy(labels[:, 1:5]) # target boxes
  239. scale_boxes(image.shape[1:], tbox, shape[0], shape[1]) # native-space labels
  240. labelsn = torch.cat((labels[:, 0:1], tbox), 1) # native-space labels
  241. scale_boxes(image.shape[1:], predn[:, :4], shape[0], shape[1]) # native-space pred
  242. return predn, labelsn
  243. def add_assets_to_artifact(self, artifact, path, asset_path, split):
  244. img_paths = sorted(glob.glob(f'{asset_path}/*'))
  245. label_paths = img2label_paths(img_paths)
  246. for image_file, label_file in zip(img_paths, label_paths):
  247. image_logical_path, label_logical_path = map(lambda x: os.path.relpath(x, path), [image_file, label_file])
  248. try:
  249. artifact.add(
  250. image_file,
  251. logical_path=image_logical_path,
  252. metadata={'split': split},
  253. )
  254. artifact.add(
  255. label_file,
  256. logical_path=label_logical_path,
  257. metadata={'split': split},
  258. )
  259. except ValueError as e:
  260. logger.error('COMET ERROR: Error adding file to Artifact. Skipping file.')
  261. logger.error(f'COMET ERROR: {e}')
  262. continue
  263. return artifact
  264. def upload_dataset_artifact(self):
  265. dataset_name = self.data_dict.get('dataset_name', 'yolov5-dataset')
  266. path = str((ROOT / Path(self.data_dict['path'])).resolve())
  267. metadata = self.data_dict.copy()
  268. for key in ['train', 'val', 'test']:
  269. split_path = metadata.get(key)
  270. if split_path is not None:
  271. metadata[key] = split_path.replace(path, '')
  272. artifact = comet_ml.Artifact(name=dataset_name, artifact_type='dataset', metadata=metadata)
  273. for key in metadata.keys():
  274. if key in ['train', 'val', 'test']:
  275. if isinstance(self.upload_dataset, str) and (key != self.upload_dataset):
  276. continue
  277. asset_path = self.data_dict.get(key)
  278. if asset_path is not None:
  279. artifact = self.add_assets_to_artifact(artifact, path, asset_path, key)
  280. self.experiment.log_artifact(artifact)
  281. return
  282. def download_dataset_artifact(self, artifact_path):
  283. logged_artifact = self.experiment.get_artifact(artifact_path)
  284. artifact_save_dir = str(Path(self.opt.save_dir) / logged_artifact.name)
  285. logged_artifact.download(artifact_save_dir)
  286. metadata = logged_artifact.metadata
  287. data_dict = metadata.copy()
  288. data_dict['path'] = artifact_save_dir
  289. metadata_names = metadata.get('names')
  290. if type(metadata_names) == dict:
  291. data_dict['names'] = {int(k): v for k, v in metadata.get('names').items()}
  292. elif type(metadata_names) == list:
  293. data_dict['names'] = {int(k): v for k, v in zip(range(len(metadata_names)), metadata_names)}
  294. else:
  295. raise "Invalid 'names' field in dataset yaml file. Please use a list or dictionary"
  296. data_dict = self.update_data_paths(data_dict)
  297. return data_dict
  298. def update_data_paths(self, data_dict):
  299. path = data_dict.get('path', '')
  300. for split in ['train', 'val', 'test']:
  301. if data_dict.get(split):
  302. split_path = data_dict.get(split)
  303. data_dict[split] = (f'{path}/{split_path}' if isinstance(split, str) else [
  304. f'{path}/{x}' for x in split_path])
  305. return data_dict
  306. def on_pretrain_routine_end(self, paths):
  307. if self.opt.resume:
  308. return
  309. for path in paths:
  310. self.log_asset(str(path))
  311. if self.upload_dataset:
  312. if not self.resume:
  313. self.upload_dataset_artifact()
  314. return
    def on_train_start(self):
        """Log hyperparameters at the start of training."""
        self.log_parameters(self.hyp)

    def on_train_epoch_start(self):
        # No-op: present to satisfy the logger-callback interface.
        return

    def on_train_epoch_end(self, epoch):
        # Track the current epoch on the experiment (read by prediction/asset naming).
        self.experiment.curr_epoch = epoch
        return

    def on_train_batch_start(self):
        # No-op: present to satisfy the logger-callback interface.
        return

    def on_train_batch_end(self, log_dict, step):
        """Record batch metrics every `comet_log_batch_interval` steps when enabled."""
        self.experiment.curr_step = step
        if self.log_batch_metrics and (step % self.comet_log_batch_interval == 0):
            self.log_metrics(log_dict, step=step)
        return
  329. def on_train_end(self, files, save_dir, last, best, epoch, results):
  330. if self.comet_log_predictions:
  331. curr_epoch = self.experiment.curr_epoch
  332. self.experiment.log_asset_data(self.metadata_dict, 'image-metadata.json', epoch=curr_epoch)
  333. for f in files:
  334. self.log_asset(f, metadata={'epoch': epoch})
  335. self.log_asset(f'{save_dir}/results.csv', metadata={'epoch': epoch})
  336. if not self.opt.evolve:
  337. model_path = str(best if best.exists() else last)
  338. name = Path(model_path).name
  339. if self.save_model:
  340. self.experiment.log_model(
  341. self.model_name,
  342. file_or_folder=model_path,
  343. file_name=name,
  344. overwrite=True,
  345. )
  346. # Check if running Experiment with Comet Optimizer
  347. if hasattr(self.opt, 'comet_optimizer_id'):
  348. metric = results.get(self.opt.comet_optimizer_metric)
  349. self.experiment.log_other('optimizer_metric_value', metric)
  350. self.finish_run()
    def on_val_start(self):
        # No-op: present to satisfy the logger-callback interface.
        return

    def on_val_batch_start(self):
        # No-op: present to satisfy the logger-callback interface.
        return

    def on_val_batch_end(self, batch_i, images, targets, paths, shapes, outputs):
        """Every `comet_log_prediction_interval` batches, rescale each image's
        predictions/labels to native space and log them via log_predictions."""
        if not (self.comet_log_predictions and ((batch_i + 1) % self.comet_log_prediction_interval == 0)):
            return

        for si, pred in enumerate(outputs):
            if len(pred) == 0:
                continue

            image = images[si]
            # Rows of `targets` whose first column matches this image's batch index.
            labels = targets[targets[:, 0] == si, 1:]
            shape = shapes[si]
            path = paths[si]
            predn, labelsn = self.preprocess_prediction(image, labels, shape, pred)
            if labelsn is not None:
                # Only images with ground-truth boxes are logged.
                self.log_predictions(image, labelsn, path, shape, predn)

        return
  369. def on_val_end(self, nt, tp, fp, p, r, f1, ap, ap50, ap_class, confusion_matrix):
  370. if self.comet_log_per_class_metrics:
  371. if self.num_classes > 1:
  372. for i, c in enumerate(ap_class):
  373. class_name = self.class_names[c]
  374. self.experiment.log_metrics(
  375. {
  376. 'mAP@.5': ap50[i],
  377. 'mAP@.5:.95': ap[i],
  378. 'precision': p[i],
  379. 'recall': r[i],
  380. 'f1': f1[i],
  381. 'true_positives': tp[i],
  382. 'false_positives': fp[i],
  383. 'support': nt[c], },
  384. prefix=class_name,
  385. )
  386. if self.comet_log_confusion_matrix:
  387. epoch = self.experiment.curr_epoch
  388. class_names = list(self.class_names.values())
  389. class_names.append('background')
  390. num_classes = len(class_names)
  391. self.experiment.log_confusion_matrix(
  392. matrix=confusion_matrix.matrix,
  393. max_categories=num_classes,
  394. labels=class_names,
  395. epoch=epoch,
  396. column_label='Actual Category',
  397. row_label='Predicted Category',
  398. file_name=f'confusion-matrix-epoch-{epoch}.json',
  399. )
    def on_fit_epoch_end(self, result, epoch):
        """Log the aggregated fit/validation metrics for this epoch."""
        self.log_metrics(result, epoch=epoch)

    def on_model_save(self, last, epoch, final_epoch, best_fitness, fi):
        """Upload checkpoints on the configured save_period cadence (never on the
        final epoch, and never when periodic saving is disabled)."""
        if ((epoch + 1) % self.opt.save_period == 0 and not final_epoch) and self.opt.save_period != -1:
            self.log_model(last.parent, self.opt, epoch, fi, best_model=best_fitness == fi)

    def on_params_update(self, params):
        """Log updated hyperparameters (e.g. from hyperparameter evolution)."""
        self.log_parameters(params)

    def finish_run(self):
        """End the Comet experiment, flushing any pending uploads."""
        self.experiment.end()