-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaction_classifier.py
More file actions
144 lines (110 loc) · 4.96 KB
/
action_classifier.py
File metadata and controls
144 lines (110 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
'''
##### DO Not Touch your face ver.0.2
### Medical Imaging & Intelligent Reality Lab (MI2RL) @ Asan Medical Center(AMC)
# MI2RL website : https://www.mi2rl.co/
# AMC : http://www.amc.seoul.kr/asan/main.do
### Developer
# Sungman Cho : dev.sungman@gmail.com
# Minjee Kim : minjeekim00@gmail.com
# Taehyeong Kim : kimtaehyeong62@gmail.com
# Junmyung Choi : jm5901@gmail.com
# Namkug Kim : namkugkim@gmail.com
### Data contributor
# MI2RL researchers
# Dongwoo Seo, Emergency Medicine@AMC
# Namkug Kim, Convergence Medicine@AMC
### references
# I3D Network (https://github.com/hassony2/kinetics_i3d_pytorch)
#####
'''
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from model.i3dpt import I3D, Unit3Dpy
from torchvision import transforms as T
import time
class ActionClassifier:
    """Clip-level action classifier built on an I3D backbone.

    Buffers incoming RGB frames into a (1, 3, T, H, W) temporal batch of
    ``temporal_batch_size`` frames; once the buffer is full, runs the I3D
    network on the clip.  When a face-touching action is predicted with
    score > 0.9, ``run`` returns a Korean warning string, otherwise ''.
    """

    def __init__(self, model_path, temporal_batch_size=24, img_size=224):
        """
        Args:
            model_path: checkpoint path for ``torch.load`` (may come from a
                ``DataParallel`` model — the 'module.' key prefix is stripped).
            temporal_batch_size: number of frames per clip fed to the network.
            img_size: square side length each frame is resized to.
        """
        # Multi-class action labels; order must match the checkpoint's head.
        self.classes = ['drinking', 'picking_up_phone', 'removing_mask',
                        'resting_chin_on_hand', 'rubbing_eyes', 'touching_glasses',
                        'touching_hairs', 'touching_keyboard', 'touching_nose',
                        'touching_phone', 'wearing_mask']
        # Subset of classes that triggers the "don't touch your face" warning.
        self.touching_actions = ['picking_up_phone', 'resting_chin_on_hand',
                                 'rubbing_eyes', 'touching_hairs', 'touching_nose']

        # Kinetics-pretrained backbone (400 classes); append a 1x1x1 conv head
        # so the output matches our class count.
        self.model = I3D(num_classes=400, modality='rgb')
        self.model.conv3d_0c_1x1 = self._modify_lastlayer(
            self.model.conv3d_0c_1x1, out_ch=len(self.classes))
        # FIX: give Softmax an explicit dim (implicit dim is deprecated; for
        # 2-D logits the legacy heuristic also chose dim 1, so behavior is
        # unchanged).
        self.model.softmax = torch.nn.Softmax(dim=1)

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)
        self.model.eval()

        # FIX: original compared with `is 'cuda'` — identity comparison
        # against a string literal is implementation-dependent; use ==.
        if self.device == 'cuda':
            state_dict = self._change_key(
                torch.load(model_path, map_location='cuda:0'))
        else:
            state_dict = self._change_key(
                torch.load(model_path, map_location=torch.device('cpu')))
        self.model.load_state_dict(state_dict)

        self.temporal_batch_size = temporal_batch_size
        # (batch, channels, time, height, width) buffer of transformed frames.
        self.temporal_batch = torch.zeros(
            (1, 3, self.temporal_batch_size, img_size, img_size))
        self.transforms = T.Compose([
            T.Resize((img_size, img_size)),
            T.ToTensor(),
            T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ])
        self.pred = None         # last string returned by run()
        self.need_sleep = False  # when True, inference is skipped
        self.cnt = 0             # next free time slot in the frame buffer

    def _set_param_requires_grad(self, feature_extracting, training_num):
        # Freeze the first `training_num` + 1 parameter tensors when feature
        # extracting, so only later layers are fine-tuned.
        if feature_extracting:
            for i, param in enumerate(self.model.parameters()):
                if training_num >= i:
                    param.requires_grad = False

    def _modify_lastlayer(self, last_layer, out_ch):
        """Return `last_layer` followed by a 1x1x1 conv mapping 400 -> n classes.

        NOTE(review): `out_ch` is accepted but the head always uses
        len(self.classes); kept as-is for checkpoint compatibility.
        """
        conv2 = Unit3Dpy(in_channels=400, out_channels=len(self.classes),
                         kernel_size=(1, 1, 1), activation=None,
                         use_bias=True, use_bn=False)
        return torch.nn.Sequential(last_layer, conv2)

    def _change_key(self, ord_dict):
        """Strip the DataParallel 'module.' prefix from state-dict keys.

        Order-preserving; equivalent to the original popitem loop (whose
        `key == old` test was always true, so every key was rewritten).
        """
        return {key.replace('module.', ''): value
                for key, value in ord_dict.items()}

    def run(self, img):
        """Buffer one frame; classify the clip when the buffer fills.

        Args:
            img: HxWx3 uint8 numpy array (one RGB frame).

        Returns:
            The Korean warning string when a touching action scores > 0.9,
            '' for any other full-clip prediction, or the previous result
            while the buffer is still filling (None before first inference).
        """
        # Convert the frame to a normalized tensor and store it in the
        # current time slot.
        pil_img = Image.fromarray(img)
        img_tensor = self.transforms(pil_img)
        self.temporal_batch[:, :, self.cnt, :, :] = img_tensor

        # Once the buffer holds a full clip, run the network.
        # FIX: original used bitwise `&` and `is False` on Python bools.
        if self.cnt == self.temporal_batch_size - 1 and not self.need_sleep:
            with torch.no_grad():  # inference only — skip autograd bookkeeping
                self.temporal_batch = self.temporal_batch.to(self.device)
                out_var, out_logit = self.model(self.temporal_batch)
            out = torch.nn.functional.softmax(out_logit, 1).data.cpu()
            top_val, top_idx = torch.sort(out, 1, descending=True)
            self.pred = self.classes[int(top_idx[0, 0].data.numpy())]
            self.score = top_val[0, 0].data.numpy()
            print(self.pred, self.score)
            if self.pred in self.touching_actions and self.score > 0.9:
                self.pred = '얼굴을 만지지 마세요 !'  # "Do not touch your face!"
            else:
                self.pred = ''

        # FIX: the original reset `cnt` only inside the branch above, so with
        # need_sleep=True the counter ran past the buffer and the next call
        # indexed out of bounds.  Wrap the counter unconditionally instead.
        self.cnt = (self.cnt + 1) % self.temporal_batch_size
        return self.pred