SLAM_dummy/slam.py at master · isaac0804/SLAM_dummy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#! /usr/bin/env python3
import sys
import os
import cv2
import numpy as np
from frame import Frame, denormalize, match_frames
from pointmap import Point, Map
import PIL.Image
import PIL.ImageOps

# Focal length F used to build the camera intrinsic matrix.
# The values below were found by guessing, one per sample video:
#   driving.mp4           -> 500
#   driving2.mp4          -> 230 (default)
#   driving_timelapse.mp4 -> 1000
# Override the default by setting the F environment variable.
F = int(os.environ.get("F", "230"))

# Module-wide map instance shared by every processed frame
mapp = Map()


def exif_transpose(img):
    """Return *img* re-oriented according to its EXIF orientation tag.

    Args:
        img: Image to normalise. When it exposes a ``_getexif()`` method
            returning a dict that contains tag 274 (orientation), its
            ``rotate`` / ``transpose`` methods are used to undo that
            orientation. Any other object (``None``, a NumPy array, an
            image without EXIF data) is returned untouched.

    Returns:
        The transformed image, or ``img`` itself when it is ``None``, carries
        no orientation entry, or the orientation is 1 or not one of 2-8.
    """
    # Compare against None explicitly: ``not img`` raises ValueError for
    # multi-element NumPy arrays such as the OpenCV frames this script reads.
    if img is None:
        return img

    exif_orientation_tag = 274

    # EXIF data is only present on some files; read it once and reuse it
    # instead of calling _getexif() for every check.
    exif_data = img._getexif() if hasattr(img, "_getexif") else None
    if not isinstance(exif_data, dict) or exif_orientation_tag not in exif_data:
        return img

    orientation = exif_data[exif_orientation_tag]

    # Handle EXIF Orientation (1 means "normal image", nothing to do)
    if orientation == 2:
        # Mirrored left to right
        img = img.transpose(PIL.Image.FLIP_LEFT_RIGHT)
    elif orientation == 3:
        # Rotated 180 degrees
        img = img.rotate(180)
    elif orientation == 4:
        # Mirrored top to bottom
        img = img.rotate(180).transpose(PIL.Image.FLIP_LEFT_RIGHT)
    elif orientation == 5:
        # Mirrored along top-left diagonal
        img = img.rotate(-90, expand=True).transpose(PIL.Image.FLIP_LEFT_RIGHT)
    elif orientation == 6:
        # Rotated 90 degrees
        img = img.rotate(-90, expand=True)
    elif orientation == 7:
        # Mirrored along top-right diagonal
        img = img.rotate(90, expand=True).transpose(PIL.Image.FLIP_LEFT_RIGHT)
    elif orientation == 8:
        # Rotated 270 degrees
        img = img.rotate(90, expand=True)

    return img


def triangulate(pose1, pose2, pts1, pts2):
    """Triangulate matched 2D points into homogeneous 4-vectors.

    Both poses are inverted first; only rows 0-2 of each inverse (the upper
    3x4 part) enter the equations. For every pair of matched points a 4x4
    homogeneous system is assembled and solved by SVD, taking the last row
    of ``vt`` as the solution.

    Args:
        pose1: 4x4 pose matrix associated with ``pts1``.
        pose2: 4x4 pose matrix associated with ``pts2``.
        pts1: Array of points, one per row; columns 0 and 1 are used.
        pts2: Points matched row-by-row with ``pts1``.

    Returns:
        Array of shape ``(pts1.shape[0], 4)`` with one un-normalised
        homogeneous point per input pair.
    """
    # Alternative left disabled in the source:
    # cv2.triangulatePoints(pose1[:3], pose2[:3], pts1.T, pts2.T).T
    points4d = np.zeros((pts1.shape[0], 4))
    inv1 = np.linalg.inv(pose1)
    inv2 = np.linalg.inv(pose2)
    for row, (obs1, obs2) in enumerate(zip(pts1, pts2)):
        # Two linear constraints per view, stacked into one 4x4 system.
        system = np.array(
            [
                obs1[0] * inv1[2] - inv1[0],
                obs1[1] * inv1[2] - inv1[1],
                obs2[0] * inv2[2] - inv2[0],
                obs2[1] * inv2[2] - inv2[1],
            ],
            dtype=np.float64,
        )
        # The right-singular vector of the smallest singular value solves it.
        points4d[row] = np.linalg.svd(system)[2][3]
    return points4d


def processing_frame(image: np.ndarray) -> None:
    """Process one video frame: track, triangulate, draw and update the map.

    The frame is downscaled to half size and converted to grayscale, wrapped
    in a ``Frame`` attached to the global ``mapp``, and matched against the
    previous frame. The relative transform from the match updates the
    current pose, existing map points gain observations, new points are
    triangulated and filtered, and the matches are drawn in the "frame"
    window. From frame id 5 onwards the map is optimized; the map display is
    refreshed on every call that gets past the first frame.

    Args:
        image: Frame as read from the video; expected to be a BGR image
            array (it is passed to ``cv2.COLOR_BGR2GRAY``).

    Returns:
        None. Results are stored in ``mapp`` and shown on screen.
    """
    # Camera intrinsics (Intrinsic Matrix includes information about camera focal length)
    # The principal point is placed at the centre of the half-size image.
    H, W = image.shape[0] // 2, image.shape[1] // 2
    K = np.array([[F, 0, W // 2], [0, F, H // 2], [0, 0, 1]])
    # Resize frame and turn to grayscale
    frame_resized = cv2.resize(image, (W, H))
    gray = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2GRAY)
    # presumably Frame appends itself to mapp.frames - confirm in frame.py
    frame = Frame(mapp, gray, K)
    # The first frame has no predecessor to match against, so stop here.
    if frame.id == 0:
        return
    print(f"***** frame {frame.id} *****")
    f1 = mapp.frames[-1]  # current frame
    f2 = mapp.frames[-2]  # previous frame
    idx1, idx2, Rt = match_frames(f1, f2)
    # idx1 is a list containing the valid indices of points in the current frame
    # idx2 is a list containing the indices of the corresponding points in the previous frame
    # to get the coordinates of the valid points, we simply use f1.kps[idx1], for instance

    f1.pose = np.dot(Rt, f2.pose)  # update the new pose (a 4x4 matrix)

    # search the previous frame to see whether the matched points already exist in the map
    for i, idx in enumerate(idx2):
        # None indicates the point has no map point yet; otherwise record
        # that the existing map point is also seen in the current frame
        if f2.pts[idx] is not None:
            f2.pts[idx].add_observation(f1, idx1[i])

    # homogeneous 3D coords
    # pts4d = triangulate(f1.pose, f2.pose, f1.kps[idx1], f2.kps[idx2])
    # pts4d /= pts4d[:, 3:]
    # pts4d has one row per entry of idx1 (row i belongs to match i)

    # Only matches whose current-frame keypoint has no map point yet are
    # candidates for new points (this must run after the loop above).
    # NOTE(review): with an empty idx1 this array would default to float64
    # and the in-place `&=` below would raise - confirm match_frames never
    # returns zero matches.
    good_pts4d = np.array(([f1.pts[i] is None for i in idx1]))

    # reject points without enough parallax
    # (triangulated with Rt for the current frame and identity for the previous one)
    pts_tri_local = triangulate(Rt, np.eye(4), f1.kps[idx1], f2.kps[idx2])
    good_pts4d &= np.abs(pts_tri_local[:, 3]) > 0.005

    # reject the points behind the camera
    # (rows with a near-zero w may become inf/nan here, but they are already masked out)
    pts_tri_local /= pts_tri_local[:, 3:]
    good_pts4d &= pts_tri_local[:, 2] > 0

    # project into the world
    pts4d = np.dot(np.linalg.inv(f1.pose), pts_tri_local.T).T

    print(f"Adding    {sum(good_pts4d)} points")

    for i, p in enumerate(pts4d):
        if not good_pts4d[i]:
            continue
        # Sample the pixel at the keypoint location before any overlay is
        # drawn on frame_resized below.
        u, v = int(round(f1.kpus[idx1[i], 0])), int(round(f1.kpus[idx1[i], 1]))
        pt = Point(mapp, p, frame_resized[v, u])
        pt.add_observation(f1, idx1[i])
        pt.add_observation(f2, idx2[i])

    # Draw every match: a circle at each position and a line joining them.
    for pt1, pt2 in zip(f1.kps[idx1], f2.kps[idx2]):
        # denormalize coordinate (current frame)
        u1, v1 = denormalize(pt1, K)
        # denormalize coordinate (last frame)
        u2, v2 = denormalize(pt2, K)

        cv2.circle(frame_resized, (u2, v2), color=(0, 0, 255), radius=3)
        cv2.circle(frame_resized, (u1, v1), color=(0, 255, 0), radius=3)
        frame_resized = cv2.line(frame_resized, (u1, v1), (u2, v2), color=(255, 0, 0))

    cv2.imshow("frame", frame_resized)
    # Optimization only starts once frame ids go beyond 4.
    if frame.id > 4:
        err = mapp.optimize()
        print(f"Optimize: {err} units of error")
    mapp.display()


if __name__ == "__main__":
    # Script entry point: play the video given on the command line through
    # processing_frame until it ends or 'q' is pressed in the pop-up window.
    if len(sys.argv) < 2:
        print(f"{sys.argv[0]} <video.mp4>")
        # sys.exit instead of the exit() builtin, which is injected by the
        # site module and is not guaranteed to exist in every interpreter.
        sys.exit(-1)

    cap = cv2.VideoCapture(sys.argv[1])
    try:
        while cap.isOpened():
            _, frame = cap.read()

            # cv2.waitKey(x) waits for x milliseconds and returns an integer
            # based on the key pressed. Only its last byte (8 bits) is wanted,
            # to avoid potential bugs (an active NumLock, for instance):
            # 0xFF is binary 11111111, and the bitwise AND (&) keeps just
            # that byte. ord('q') is the character code of 'q', which also
            # fits in one byte.
            # REMEMBER to press the key in the pop-up window, not the terminal.
            # When the video ends, frame is None, so this check must come
            # before the frame is processed.
            if cv2.waitKey(1) & 0xFF == ord('q') or frame is None:
                break

            # frame = exif_transpose(frame)
            processing_frame(frame)
    finally:
        # Free the capture and close the windows even if processing raised.
        cap.release()
        cv2.destroyAllWindows()