# model.py
import torch
import torch.nn as nn
import timm
from transformers import ViTModel, AutoModel, SwinModel


class MultiModalTransformer(nn.Module):
    def __init__(self, task, img_model_name, img_modalities, with_text, cxr_pretrained=False):
        super().__init__()
        # Select the vision backbone class and its embedding size from the model name.
        if 'vit' in img_model_name:
            feature_extractor = ViTModel
            self.vision_embed_dim = 768
        elif img_model_name == 'microsoft/swin-base-patch4-window7-224-in22k':
            feature_extractor = SwinModel
            self.vision_embed_dim = 1024
        elif img_model_name == 'microsoft/swin-large-patch4-window12-384-in22k':
            feature_extractor = SwinModel
            self.vision_embed_dim = 1536
        else:
            raise ValueError(f'Unsupported image model: {img_model_name}')
        # One separately weighted backbone per image modality.
        self.img_models = nn.ModuleDict({
            mod: feature_extractor.from_pretrained(img_model_name, output_attentions=True)
            for mod in img_modalities
        })
        # Optionally warm-start the CXR backbone from a CheXpert-pretrained checkpoint.
        if cxr_pretrained and 'cxr' in img_modalities:
            if 'swin' in img_model_name and '224' in img_model_name:
                ckpt_path = 'chexpert-pretrain/results_swin224/m-epoch_3_lat.pth.tar'
            elif 'swin' in img_model_name and '384' in img_model_name:
                ckpt_path = 'chexpert-pretrain/results_swin384/m-epoch_3_lat.pth.tar'
            else:
                ckpt_path = 'chexpert-pretrain/results_vit224/m-epoch_3_lat.pth.tar'
            ckpt = torch.load(ckpt_path)['state_dict']
            # Keep only the feature-extractor weights and strip the wrapper prefix
            # so the keys match the backbone's own state dict.
            search_str = 'module.feature_extractor.'
            ckpt = {k.replace(search_str, ''): v for k, v in ckpt.items() if k.startswith(search_str)}
            self.img_models['cxr'].load_state_dict(ckpt, strict=False)
        self.with_text = with_text
        self.text_model = AutoModel.from_pretrained('roberta-base', output_attentions=True)
        self.text_embed_dim = 768
        self.projection_dim = 512
        # Fused embedding size: one pooled vector per image modality, plus text if enabled.
        projection_in_dim = self.vision_embed_dim * len(img_modalities)
        if with_text:
            projection_in_dim += self.text_embed_dim
        self.task = task
        num_classes_per_task = {
            'phenotyping': 25,
            'length-of-stay': 10,
            'decompensation': 1,
            'in-hospital-mortality': 1,
        }
        self.num_classes = num_classes_per_task[task]
        self.dropout = nn.Dropout(0.1)
        self.projection = nn.Linear(projection_in_dim, self.projection_dim, bias=False)
        self.activation = nn.Tanh()
        self.classifier = nn.Linear(self.projection_dim, self.num_classes) if self.num_classes > 0 else nn.Identity()
    def forward(self, data, return_attention=False):
        image_embeds, attentions = [], {}
        # Encode each image modality and keep its pooled embedding.
        for mod, vit in self.img_models.items():
            vision_outputs = vit(data[mod])
            image_embeds_modality = vision_outputs[1]  # pooled output rather than the raw CLS token
            image_embeds.append(image_embeds_modality)
            if return_attention:
                attentions[mod] = [a.detach() for a in vision_outputs.attentions]
        vl_embeds = torch.cat(image_embeds, dim=1)
        if self.with_text:
            # Encode the clinical text and use its [CLS]-position embedding.
            text_outputs = self.text_model(input_ids=data['input_ids'], attention_mask=data['attention_mask'])
            text_embeds = text_outputs[0][:, 0, :]
            vl_embeds = torch.cat([vl_embeds, text_embeds], dim=1)
            if return_attention:
                attentions['text'] = [a.detach() for a in text_outputs.attentions]
        # Dropout -> linear projection -> tanh -> dropout, then task classifier.
        pooled_output = self.dropout(vl_embeds)
        pooled_output = self.projection(pooled_output)
        pooled_output = self.activation(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        if return_attention:
            return logits, attentions
        return logits


class MultiModalConv(nn.Module):
    def __init__(self, task, img_modalities, with_text, img_model_name=None):
        super().__init__()
        # One ConvNeXt-Base backbone per image modality, with its classification head removed.
        self.img_models = nn.ModuleDict()
        for mod in img_modalities:
            feature_extractor = timm.create_model('convnext_base', pretrained=True)
            feature_extractor.head = nn.Identity()
            self.img_models[mod] = feature_extractor
        self.with_text = with_text
        self.text_model = AutoModel.from_pretrained('roberta-base', output_attentions=True)
        self.vision_embed_dim = 1024
        self.text_embed_dim = 768
        self.projection_dim = 512
        # Fused embedding size: one pooled vector per image modality, plus text if enabled.
        projection_in_dim = self.vision_embed_dim * len(img_modalities)
        if with_text:
            projection_in_dim += self.text_embed_dim
        self.num_classes = 25 if task == 'phenotyping' else 1
        self.dropout = nn.Dropout(0.1)
        self.projection = nn.Linear(projection_in_dim, self.projection_dim, bias=False)
        self.activation = nn.Tanh()
        self.classifier = nn.Linear(self.projection_dim, self.num_classes) if self.num_classes > 0 else nn.Identity()
    def forward(self, data, return_attention=False):
        image_embeds, attentions = [], {}
        # Encode each image modality; global-average-pool the ConvNeXt feature map.
        for mod, cnn in self.img_models.items():
            vision_outputs = cnn(data[mod])
            image_embeds_modality = vision_outputs.mean(dim=(2, 3))
            image_embeds.append(image_embeds_modality)
        vl_embeds = torch.cat(image_embeds, dim=1)
        if self.with_text:
            # Encode the clinical text and use its [CLS]-position embedding.
            text_outputs = self.text_model(input_ids=data['input_ids'], attention_mask=data['attention_mask'])
            text_embeds = text_outputs[0][:, 0, :]
            vl_embeds = torch.cat([vl_embeds, text_embeds], dim=1)
        # Dropout -> linear projection -> tanh -> dropout, then task classifier.
        pooled_output = self.dropout(vl_embeds)
        pooled_output = self.projection(pooled_output)
        pooled_output = self.activation(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
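

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original file).
# It assumes 224x224 RGB inputs, a single 'cxr' image modality, and the
# Hugging Face checkpoint 'google/vit-base-patch16-224-in21k' for the ViT
# branch; these are example choices, not values mandated elsewhere in the
# repository. Running it downloads the pretrained ViT / ConvNeXt / RoBERTa
# weights.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    batch = {
        'cxr': torch.randn(2, 3, 224, 224),                     # dummy chest X-ray batch
        'input_ids': torch.randint(0, 50265, (2, 32)),          # dummy RoBERTa token ids
        'attention_mask': torch.ones(2, 32, dtype=torch.long),  # no padding in the dummy text
    }

    transformer_model = MultiModalTransformer(
        task='in-hospital-mortality',
        img_model_name='google/vit-base-patch16-224-in21k',     # example ViT checkpoint
        img_modalities=['cxr'],
        with_text=True,
    )
    with torch.no_grad():
        logits, attentions = transformer_model(batch, return_attention=True)
    print('transformer logits:', logits.shape)                  # (2, 1) for mortality

    conv_model = MultiModalConv(task='phenotyping', img_modalities=['cxr'], with_text=False)
    with torch.no_grad():
        logits = conv_model(batch)
    print('convnext logits:', logits.shape)                     # (2, 25) for phenotyping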