Skip to content

Feature/behave like node #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions posenet/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def preprocess_input(self, image):
pass

def predict(self, image):
input_image, image_scale = self.preprocess_input(image)
input_image, image_scale, image_padding = self.preprocess_input(image)

input_image = tf.convert_to_tensor(input_image, dtype=tf.float32)

Expand All @@ -37,4 +37,4 @@ def predict(self, image):
displacement_fwd_result = result[self.output_tensor_names[self.DISPLACEMENT_FWD_KEY]]
displacement_bwd_result = result[self.output_tensor_names[self.DISPLACEMENT_BWD_KEY]]

return tf.sigmoid(heatmap_result), offsets_result, displacement_fwd_result, displacement_bwd_result, image_scale
return tf.sigmoid(heatmap_result), offsets_result, displacement_fwd_result, displacement_bwd_result, image_scale, image_padding
17 changes: 11 additions & 6 deletions posenet/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


def traverse_to_targ_keypoint(
edge_id, source_keypoint, target_keypoint_id, scores, offsets, output_stride, displacements
edge_id, source_keypoint, target_keypoint_id, scores, offsets, output_stride, displacements, offset_refine_step = 2
):
height = scores.shape[0]
width = scores.shape[1]
Expand All @@ -15,15 +15,20 @@ def traverse_to_targ_keypoint(
displaced_point = source_keypoint + displacements[
source_keypoint_indices[0], source_keypoint_indices[1], edge_id]

for i in range(0, offset_refine_step):
displaced_point_indices = np.clip(
np.round(displaced_point / output_stride), a_min=0, a_max=[height - 1, width - 1]).astype(np.int32)

displaced_point = displaced_point_indices * output_stride + offsets[
displaced_point_indices[0], displaced_point_indices[1], target_keypoint_id]

displaced_point_indices = np.clip(
np.round(displaced_point / output_stride), a_min=0, a_max=[height - 1, width - 1]).astype(np.int32)

score = scores[displaced_point_indices[0], displaced_point_indices[1], target_keypoint_id]

image_coord = displaced_point_indices * output_stride + offsets[
displaced_point_indices[0], displaced_point_indices[1], target_keypoint_id]
score = scores[displaced_point_indices[0],
displaced_point_indices[1], target_keypoint_id]

return score, image_coord
return score, displaced_point


def decode_pose(
Expand Down
12 changes: 11 additions & 1 deletion posenet/mobilenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,21 @@ def __init__(self, model_function, output_tensor_names, output_stride):

def preprocess_input(self, image):
    """Pad, resize and normalize a BGR image for MobileNet PoseNet inference.

    Args:
        image: H x W x 3 uint8 BGR image (OpenCV convention) -- assumed from
            the cv2 calls below; confirm against callers.

    Returns:
        A 3-tuple of:
          - input_img: 1 x target_height x target_width x 3 float32 NHWC
            tensor, RGB, values normalized to [-1, 1];
          - scale: np.array([scale_y, scale_x]) mapping network-input
            coordinates back to the *padded* image's pixel space;
          - padding: np.array([pad_y, pad_x]) -- pixels of constant border
            added on each side; callers subtract it (after applying `scale`)
            to recover original-image coordinates.
    """
    target_width, target_height = self.valid_resolution(image.shape[1], image.shape[0])

    # Pad symmetrically so the image matches the network's aspect ratio,
    # instead of letting the resize below distort it.
    target_aspect = target_width / target_height
    aspect = image.shape[1] / image.shape[0]
    if aspect < target_aspect:
        # Image is too narrow: pad left/right.
        padding = np.array([0, round(0.5 * (target_aspect * image.shape[0] - image.shape[1]))])
    else:
        # Image is too wide (or aspect matches): pad top/bottom.
        padding = np.array([round(0.5 * ((1.0 / target_aspect) * image.shape[1] - image.shape[0])), 0])
    image = cv2.copyMakeBorder(image, padding[0], padding[0], padding[1], padding[1],
                               cv2.BORDER_CONSTANT, value=[0, 0, 0])

    # The scale that can get us back to the original width and height;
    # note image.shape here is the shape *after* padding.
    scale = np.array([image.shape[0] / target_height, image.shape[1] / target_width])

    input_img = cv2.resize(image, (target_width, target_height), interpolation=cv2.INTER_LINEAR)
    input_img = cv2.cvtColor(input_img, cv2.COLOR_BGR2RGB).astype(np.float32)  # BGR -> RGB
    input_img = input_img * (2.0 / 255.0) - 1.0  # normalize to [-1, 1]
    input_img = input_img.reshape(1, target_height, target_width, 3)  # NHWC
    # NOTE(review): the rendered diff showed a stale `return input_img, scale`
    # immediately before this line (the pre-change return, which would make
    # this one unreachable); only the updated 3-tuple return is kept.
    return input_img, scale, padding
4 changes: 2 additions & 2 deletions posenet/posenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def __init__(self, model: BaseModel, min_score=0.25):
self.min_score = min_score

def estimate_multiple_poses(self, image, max_pose_detections=10):
heatmap_result, offsets_result, displacement_fwd_result, displacement_bwd_result, image_scale = \
heatmap_result, offsets_result, displacement_fwd_result, displacement_bwd_result, image_scale, image_padding = \
self.model.predict(image)

pose_scores, keypoint_scores, keypoint_coords = posenet.decode_multiple_poses(
Expand All @@ -21,7 +21,7 @@ def estimate_multiple_poses(self, image, max_pose_detections=10):
max_pose_detections=max_pose_detections,
min_pose_score=self.min_score)

keypoint_coords *= image_scale
keypoint_coords = keypoint_coords * image_scale - image_padding

return pose_scores, keypoint_scores, keypoint_coords

Expand Down
12 changes: 11 additions & 1 deletion posenet/resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ def __init__(self, model_function, output_tensor_names, output_stride):

def preprocess_input(self, image):
target_width, target_height = self.valid_resolution(image.shape[1], image.shape[0])
# the padding to keep the aspect ratio:
target_aspect = target_width / target_height
aspect = image.shape[1] / image.shape[0]
if aspect < target_aspect:
padding = np.array([0, round(0.5 * (target_aspect * image.shape[0] - image.shape[1]))])
else:
padding = np.array([round(0.5 * ((1.0 / target_aspect) * image.shape[1] - image.shape[0])), 0])
image = cv2.copyMakeBorder(image, padding[0], padding[0], padding[1], padding[1],
cv2.BORDER_CONSTANT, value=[0,0,0])

# the scale that can get us back to the original width and height:
scale = np.array([image.shape[0] / target_height, image.shape[1] / target_width])
input_img = cv2.resize(image, (target_width, target_height), interpolation=cv2.INTER_LINEAR)
Expand All @@ -20,4 +30,4 @@ def preprocess_input(self, image):
# See: https://github.com/tensorflow/tfjs-models/blob/master/body-pix/src/resnet.ts
input_img = input_img + self.image_net_mean
input_img = input_img.reshape(1, target_height, target_width, 3) # HWC to NHWC
return input_img, scale
return input_img, scale, padding