Conversation
| } | ||
|
|
||
|
|
||
| class DeformableDetrConfig(PretrainedConfig): |
| encoder_n_points (`int`, *optional*, defaults to 4): | ||
| The number of sampled keys in each feature level for each attention head in the encoder. | ||
| decoder_n_points (`int`, *optional*, defaults to 4): | ||
| The number of sampled keys in each feature level for each attention head in the decoder. |
There was a problem hiding this comment.
deformable attention에서 필요한 config
| two_stage (`bool`, *optional*, defaults to `False`): | ||
| Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of | ||
| Deformable DETR, which are further fed into the decoder for iterative bounding box refinement. | ||
| two_stage_num_proposals (`int`, *optional*, defaults to 300): | ||
| The number of region proposals to be generated, in case `two_stage` is set to `True`. |
There was a problem hiding this comment.
two-stage model 관련 config들
| with_box_refine (`bool`, *optional*, defaults to `False`): | ||
| Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes | ||
| based on the predictions from the previous layer. |
There was a problem hiding this comment.
with_box_refine : 이전 레이어에서 나온 bbox를 초기값으로 사용함
| num_feature_levels (`int`, *optional*, defaults to 4): | ||
| The number of input feature levels. |
| if self.config.two_stage: | ||
| object_query_embedding, output_proposals = self.gen_encoder_output_proposals( | ||
| encoder_outputs[0], ~mask_flatten, spatial_shapes | ||
| ) | ||
|
|
||
| # hack implementation for two-stage Deformable DETR | ||
| # apply a detection head to each pixel (A.4 in paper) | ||
| # linear projection for bounding box binary classification (i.e. foreground and background) | ||
| enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) | ||
| # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) | ||
| delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) | ||
| enc_outputs_coord_logits = delta_bbox + output_proposals | ||
|
|
||
| # only keep top scoring `config.two_stage_num_proposals` proposals | ||
| topk = self.config.two_stage_num_proposals | ||
| topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] | ||
| topk_coords_logits = torch.gather( | ||
| enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) | ||
| ) | ||
|
|
||
| topk_coords_logits = topk_coords_logits.detach() | ||
| reference_points = topk_coords_logits.sigmoid() | ||
| init_reference_points = reference_points | ||
| pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) | ||
| query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) |
There was a problem hiding this comment.
two-stage는 일단 모든 픽셀에 대해 bbox들을 뽑고, top-k bbox coordinate의 positional embedding을 query_embed로 줌
| """, | ||
| DEFORMABLE_DETR_START_DOCSTRING, | ||
| ) | ||
| class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel): |
|
|
||
| @add_start_docstrings_to_model_forward(DEFORMABLE_DETR_INPUTS_DOCSTRING) | ||
| @replace_return_docstrings(output_type=DeformableDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) | ||
| def forward( |
| for level in range(hidden_states.shape[0]): | ||
| if level == 0: | ||
| reference = init_reference | ||
| else: | ||
| reference = inter_references[level - 1] | ||
| reference = inverse_sigmoid(reference) | ||
| outputs_class = self.class_embed[level](hidden_states[level]) | ||
| delta_bbox = self.bbox_embed[level](hidden_states[level]) | ||
| if reference.shape[-1] == 4: | ||
| outputs_coord_logits = delta_bbox + reference | ||
| elif reference.shape[-1] == 2: | ||
| delta_bbox[..., :2] += reference | ||
| outputs_coord_logits = delta_bbox |
There was a problem hiding this comment.
이전 레이어 결과 값의 inverse sigmoid에 delta bbox(dx)를 더한 뒤 다시 sigmoid를 적용 — 별거 아니고 결과를 0~1 범위 값으로 만들어주려는 것임
|
|
||
|
|
||
| # Copied from transformers.models.detr.modeling_detr.DetrHungarianMatcher | ||
| class DeformableDetrHungarianMatcher(nn.Module): |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| """Feature extractor class for Deformable DETR.""" |
There was a problem hiding this comment.
원래 DETR이랑 나머지는 다 같은데 post_process만 다르다고 함!
The postprocessing of Deformable DETR is actually different compared to regular DETR: a sigmoid activation function is used rather than softmax, and the no-object class is included, whereas DETR discards this class.
| return color | ||
|
|
||
|
|
||
| class DeformableDetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): |
There was a problem hiding this comment.
겸사겸사해서 FeatureExtractor를 보자! 이 클래스는 전처리하는 class임!
| do_resize (`bool`, *optional*, defaults to `True`): | ||
| Whether to resize the input to a certain `size`. |
There was a problem hiding this comment.
resize=True! 어떻게 resize해주려나
| size (`int`, *optional*, defaults to 800): | ||
| Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a | ||
| sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of | ||
| the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * | ||
| height / width, size)`. |
There was a problem hiding this comment.
width, height로 들어가면 거기에 맞추고, 하나만 들어가면 단축을 size에 맞게 맞춤
| max_size (`int`, *optional*, defaults to 1333): | ||
| The largest size an image dimension can have (otherwise it's capped). Only has an effect if `do_resize` is | ||
| set to `True`. |
There was a problem hiding this comment.
들어올 수 있는 최대(장축)의 image size.
| return image, target | ||
|
|
||
| # Copied from transformers.models.detr.feature_extraction_detr.DetrFeatureExtractor._resize | ||
| def _resize(self, image, size, target=None, max_size=None): |
| def get_size_with_aspect_ratio(image_size, size, max_size=None): | ||
| w, h = image_size | ||
| if max_size is not None: | ||
| min_original_size = float(min((w, h))) | ||
| max_original_size = float(max((w, h))) | ||
| if max_original_size / min_original_size * size > max_size: | ||
| size = int(round(max_size * min_original_size / max_original_size)) |
There was a problem hiding this comment.
max_size가 주어졌을 경우에 size 구함
| def get_size(image_size, size, max_size=None): | ||
| if isinstance(size, (list, tuple)): | ||
| return size | ||
| else: | ||
| # size returned must be (w, h) since we use PIL to resize images | ||
| # so we revert the tuple | ||
| return get_size_with_aspect_ratio(image_size, size, max_size)[::-1] |
There was a problem hiding this comment.
size가 (list/tuple이 아닌) int 하나로 주어지면 비율에 맞춰서 크기 만듦! 즉 비율은 안 바뀜! -> 현재 학습 중인 DETR의 셋팅인듯?
|
|
||
| return encoded_inputs | ||
|
|
||
| def post_process(self, outputs, target_sizes): |
| if target_sizes.shape[1] != 2: | ||
| raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") | ||
|
|
||
| prob = out_logits.sigmoid() |
원본 코드는 https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py 인데 어차피 transformers 쓸거니 transformers 코드 보자 https://github.com/huggingface/transformers/tree/main/src/transformers/models/deformable_detr