common.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. # Copyright 2025 Yakhyokhuja Valikhujaev
  2. # Author: Yakhyokhuja Valikhujaev
  3. # GitHub: https://github.com/yakhyo
  4. import itertools
  5. import math
  6. from typing import List, Optional, Tuple
  7. import cv2
  8. import numpy as np
# Public API of this module: letterbox resizing, RetinaFace anchor generation
# and box/landmark decoding, greedy NMS, and SCRFD distance-based decoding.
__all__ = [
    'resize_image',
    'generate_anchors',
    'non_max_suppression',
    'decode_boxes',
    'decode_landmarks',
    'distance2bbox',
    'distance2kps',
]
  18. def resize_image(frame, target_shape: Tuple[int, int] = (640, 640)) -> Tuple[np.ndarray, float]:
  19. """
  20. Resize an image to fit within a target shape while keeping its aspect ratio.
  21. Args:
  22. frame (np.ndarray): Input image.
  23. target_shape (Tuple[int, int]): Target size (width, height). Defaults to (640, 640).
  24. Returns:
  25. Tuple[np.ndarray, float]: Resized image on a blank canvas and the resize factor.
  26. """
  27. width, height = target_shape
  28. # Aspect-ratio preserving resize
  29. im_ratio = float(frame.shape[0]) / frame.shape[1]
  30. model_ratio = height / width
  31. if im_ratio > model_ratio:
  32. new_height = height
  33. new_width = int(new_height / im_ratio)
  34. else:
  35. new_width = width
  36. new_height = int(new_width * im_ratio)
  37. resize_factor = float(new_height) / frame.shape[0]
  38. resized_frame = cv2.resize(frame, (new_width, new_height))
  39. # Create blank image and place resized image on it
  40. image = np.zeros((height, width, 3), dtype=np.uint8)
  41. image[:new_height, :new_width, :] = resized_frame
  42. return image, resize_factor
  43. def generate_anchors(image_size: Tuple[int, int] = (640, 640)) -> np.ndarray:
  44. """
  45. Generate anchor boxes for a given image size (RetinaFace specific).
  46. Args:
  47. image_size (Tuple[int, int]): Input image size (width, height). Defaults to (640, 640).
  48. Returns:
  49. np.ndarray: Anchor box coordinates as a NumPy array with shape (num_anchors, 4).
  50. """
  51. steps = [8, 16, 32]
  52. min_sizes = [[16, 32], [64, 128], [256, 512]]
  53. anchors = []
  54. feature_maps = [[math.ceil(image_size[0] / step), math.ceil(image_size[1] / step)] for step in steps]
  55. for k, (map_height, map_width) in enumerate(feature_maps):
  56. step = steps[k]
  57. for i, j in itertools.product(range(map_height), range(map_width)):
  58. for min_size in min_sizes[k]:
  59. s_kx = min_size / image_size[1]
  60. s_ky = min_size / image_size[0]
  61. dense_cx = [x * step / image_size[1] for x in [j + 0.5]]
  62. dense_cy = [y * step / image_size[0] for y in [i + 0.5]]
  63. for cy, cx in itertools.product(dense_cy, dense_cx):
  64. anchors += [cx, cy, s_kx, s_ky]
  65. output = np.array(anchors, dtype=np.float32).reshape(-1, 4)
  66. return output
  67. def non_max_suppression(dets: np.ndarray, threshold: float) -> List[int]:
  68. """
  69. Apply Non-Maximum Suppression (NMS) to reduce overlapping bounding boxes based on a threshold.
  70. Args:
  71. dets (np.ndarray): Array of detections with each row as [x1, y1, x2, y2, score].
  72. threshold (float): IoU threshold for suppression.
  73. Returns:
  74. List[int]: Indices of bounding boxes retained after suppression.
  75. """
  76. x1 = dets[:, 0]
  77. y1 = dets[:, 1]
  78. x2 = dets[:, 2]
  79. y2 = dets[:, 3]
  80. scores = dets[:, 4]
  81. areas = (x2 - x1 + 1) * (y2 - y1 + 1)
  82. order = scores.argsort()[::-1]
  83. keep = []
  84. while order.size > 0:
  85. i = order[0]
  86. keep.append(i)
  87. xx1 = np.maximum(x1[i], x1[order[1:]])
  88. yy1 = np.maximum(y1[i], y1[order[1:]])
  89. xx2 = np.minimum(x2[i], x2[order[1:]])
  90. yy2 = np.minimum(y2[i], y2[order[1:]])
  91. w = np.maximum(0.0, xx2 - xx1 + 1)
  92. h = np.maximum(0.0, yy2 - yy1 + 1)
  93. inter = w * h
  94. ovr = inter / (areas[i] + areas[order[1:]] - inter)
  95. inds = np.where(ovr <= threshold)[0]
  96. order = order[inds + 1]
  97. return keep
  98. def decode_boxes(loc: np.ndarray, priors: np.ndarray, variances: Optional[List[float]] = None) -> np.ndarray:
  99. """
  100. Decode locations from predictions using priors to undo
  101. the encoding done for offset regression at train time (RetinaFace specific).
  102. Args:
  103. loc (np.ndarray): Location predictions for loc layers, shape: [num_priors, 4]
  104. priors (np.ndarray): Prior boxes in center-offset form, shape: [num_priors, 4]
  105. variances (Optional[List[float]]): Variances of prior boxes. Defaults to [0.1, 0.2].
  106. Returns:
  107. np.ndarray: Decoded bounding box predictions with shape [num_priors, 4]
  108. """
  109. if variances is None:
  110. variances = [0.1, 0.2]
  111. # Compute centers of predicted boxes
  112. cxcy = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
  113. # Compute widths and heights of predicted boxes
  114. wh = priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])
  115. # Convert center, size to corner coordinates
  116. boxes = np.zeros_like(loc)
  117. boxes[:, :2] = cxcy - wh / 2 # xmin, ymin
  118. boxes[:, 2:] = cxcy + wh / 2 # xmax, ymax
  119. return boxes
  120. def decode_landmarks(
  121. predictions: np.ndarray, priors: np.ndarray, variances: Optional[List[float]] = None
  122. ) -> np.ndarray:
  123. """
  124. Decode landmark predictions using prior boxes (RetinaFace specific).
  125. Args:
  126. predictions (np.ndarray): Landmark predictions, shape: [num_priors, 10]
  127. priors (np.ndarray): Prior boxes, shape: [num_priors, 4]
  128. variances (Optional[List[float]]): Scaling factors for landmark offsets. Defaults to [0.1, 0.2].
  129. Returns:
  130. np.ndarray: Decoded landmarks, shape: [num_priors, 10]
  131. """
  132. if variances is None:
  133. variances = [0.1, 0.2]
  134. # Reshape predictions to [num_priors, 5, 2] to process landmark points
  135. predictions = predictions.reshape(predictions.shape[0], 5, 2)
  136. # Expand priors to match (num_priors, 5, 2)
  137. priors_xy = np.repeat(priors[:, :2][:, np.newaxis, :], 5, axis=1) # (num_priors, 5, 2)
  138. priors_wh = np.repeat(priors[:, 2:][:, np.newaxis, :], 5, axis=1) # (num_priors, 5, 2)
  139. # Compute absolute landmark positions
  140. landmarks = priors_xy + predictions * variances[0] * priors_wh
  141. # Flatten back to [num_priors, 10]
  142. landmarks = landmarks.reshape(landmarks.shape[0], -1)
  143. return landmarks
  144. def distance2bbox(points: np.ndarray, distance: np.ndarray, max_shape: Optional[Tuple[int, int]] = None) -> np.ndarray:
  145. """
  146. Decode distance prediction to bounding box (SCRFD specific).
  147. Args:
  148. points (np.ndarray): Anchor points with shape (n, 2), [x, y].
  149. distance (np.ndarray): Distance from the given point to 4
  150. boundaries (left, top, right, bottom) with shape (n, 4).
  151. max_shape (Optional[Tuple[int, int]]): Shape of the image (height, width) for clipping.
  152. Returns:
  153. np.ndarray: Decoded bounding boxes with shape (n, 4) as [x1, y1, x2, y2].
  154. """
  155. x1 = points[:, 0] - distance[:, 0]
  156. y1 = points[:, 1] - distance[:, 1]
  157. x2 = points[:, 0] + distance[:, 2]
  158. y2 = points[:, 1] + distance[:, 3]
  159. if max_shape is not None:
  160. x1 = np.clip(x1, 0, max_shape[1])
  161. y1 = np.clip(y1, 0, max_shape[0])
  162. x2 = np.clip(x2, 0, max_shape[1])
  163. y2 = np.clip(y2, 0, max_shape[0])
  164. else:
  165. x1 = np.maximum(x1, 0)
  166. y1 = np.maximum(y1, 0)
  167. x2 = np.maximum(x2, 0)
  168. y2 = np.maximum(y2, 0)
  169. return np.stack([x1, y1, x2, y2], axis=-1)
  170. def distance2kps(points: np.ndarray, distance: np.ndarray, max_shape: Optional[Tuple[int, int]] = None) -> np.ndarray:
  171. """
  172. Decode distance prediction to keypoints (SCRFD specific).
  173. Args:
  174. points (np.ndarray): Anchor points with shape (n, 2), [x, y].
  175. distance (np.ndarray): Distance from the given point to keypoints with shape (n, 2k).
  176. max_shape (Optional[Tuple[int, int]]): Shape of the image (height, width) for clipping.
  177. Returns:
  178. np.ndarray: Decoded keypoints with shape (n, 2k).
  179. """
  180. preds = []
  181. for i in range(0, distance.shape[1], 2):
  182. px = points[:, i % 2] + distance[:, i]
  183. py = points[:, i % 2 + 1] + distance[:, i + 1]
  184. if max_shape is not None:
  185. px = np.clip(px, 0, max_shape[1])
  186. py = np.clip(py, 0, max_shape[0])
  187. preds.append(px)
  188. preds.append(py)
  189. return np.stack(preds, axis=-1)