Skip to content

xxx #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: add_log_v33
Choose a base branch
from
Open

xxx #23

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions mmdet/engine/hooks/visualization_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
gt_bboxes = gt_instances.get('bboxes', None)
if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes):
gt_instances.bboxes = gt_bboxes.tensor
print(gt_labels, tokens_positive, gt_bboxes, img_path)
# print(gt_labels, tokens_positive, gt_bboxes, img_path)
pred_instances = data_sample.pred_instances
pred_instances = pred_instances[
pred_instances.scores > self.score_thr]
Expand All @@ -416,8 +416,8 @@ def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,
self._visualizer.set_image(img)

for label, bbox, color in zip(gt_labels, gt_bboxes, colors):
self._visualizer.draw_bboxes(
bbox, edge_colors=color, face_colors=color, alpha=0.3)
# self._visualizer.draw_bboxes(
# bbox, edge_colors=color, face_colors=color, alpha=0.3)
self._visualizer.draw_bboxes(
bbox, edge_colors=color, alpha=1)

Expand Down Expand Up @@ -460,11 +460,11 @@ def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict,

for label, bbox, color in zip(pred_labels, pred_bboxes,
colors):
self._visualizer.draw_bboxes(
bbox, edge_colors=color, face_colors=color, alpha=0.3)
# self._visualizer.draw_bboxes(
# bbox, edge_colors=color, face_colors=color, alpha=0.3)
self._visualizer.draw_bboxes(
bbox, edge_colors=color, alpha=1)
print(pred_labels, pred_bboxes, pred_scores, colors)
# print(pred_labels, pred_bboxes, pred_scores, colors)
areas = (pred_bboxes[:, 3] - pred_bboxes[:, 1]) * (
pred_bboxes[:, 2] - pred_bboxes[:, 0])
scales = _get_adaptive_scales(areas)
Expand Down
152 changes: 106 additions & 46 deletions mmdet/models/dense_heads/grounding_dino_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ..layers import inverse_sigmoid
from .atss_vlfusion_head import convert_grounding_to_cls_scores
from .dino_head import DINOHead
import torch.nn.functional as F


class ContrastiveEmbed(nn.Module):
Expand Down Expand Up @@ -60,7 +61,7 @@ def __init__(self,
torch.Tensor([bias_value]), requires_grad=True)

def forward(self, visual_feat: Tensor, text_feat: Tensor,
text_token_mask: Tensor) -> Tensor:
text_token_mask: Tensor, need_expand=True) -> Tensor:
"""Forward function.

Args:
Expand All @@ -79,13 +80,15 @@ def forward(self, visual_feat: Tensor, text_feat: Tensor,
res = res / math.sqrt(visual_feat.shape[-1])
if self.bias is not None:
res = res + self.bias
res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))

new_res = torch.full((*res.shape[:-1], self.max_text_len),
float('-inf'),
device=res.device)
new_res[..., :res.shape[-1]] = res

if need_expand:
res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
new_res = torch.full((*res.shape[:-1], self.max_text_len),
float('-inf'),
device=res.device)
new_res[..., :res.shape[-1]] = res
else:
res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
new_res = res
return new_res


Expand Down Expand Up @@ -190,10 +193,16 @@ def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor,

# Major changes. The labels are 0-1 binary labels for each bbox
# and text tokens.
labels = gt_bboxes.new_full((num_bboxes, self.max_text_len),
0,
dtype=torch.float32)
labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds]
if 'positive_maps' in gt_instances:
labels = gt_bboxes.new_full((num_bboxes, self.max_text_len),
0,
dtype=torch.float32)
labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds]
else:
labels = gt_bboxes.new_full((num_bboxes,),
cls_score.size(1),
dtype=torch.long)
labels[pos_inds] = gt_instances.labels[pos_assigned_gt_inds]
label_weights = gt_bboxes.new_ones(num_bboxes)

# bbox targets
Expand All @@ -211,11 +220,12 @@ def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor,
neg_inds)

def forward(
self,
hidden_states: Tensor,
references: List[Tensor],
memory_text: Tensor,
text_token_mask: Tensor,
self,
hidden_states: Tensor,
references: List[Tensor],
memory_text: Tensor,
text_token_mask: Tensor,
need_expand=True
) -> Tuple[Tensor]:
"""Forward function.

Expand Down Expand Up @@ -257,7 +267,7 @@ def forward(
hidden_state = hidden_states[layer_id]
outputs_class = self.cls_branches[layer_id](hidden_state,
memory_text,
text_token_mask)
text_token_mask, need_expand)
tmp_reg_preds = self.reg_branches[layer_id](hidden_state)
if reference.shape[-1] == 4:
# When `layer` is 0 and `as_two_stage` of the detector
Expand Down Expand Up @@ -319,12 +329,17 @@ def predict(self,
batch_img_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
batch_token_positive_maps = [
data_samples.token_positive_map
for data_samples in batch_data_samples
]

outs = self(hidden_states, references, memory_text, text_token_mask)
need_expand = True
batch_token_positive_maps = []
for data_samples in batch_data_samples:
if 'token_positive_map' in data_samples:
batch_token_positive_maps.append(data_samples.token_positive_map)
else:
batch_token_positive_maps.append(None)
need_expand = False

outs = self(hidden_states, references, memory_text, text_token_mask, need_expand=need_expand)

predictions = self.predict_by_feat(
*outs,
Expand Down Expand Up @@ -427,11 +442,13 @@ def _predict_by_feat_single(self,
bbox_index = indexes // num_classes
bbox_pred = bbox_pred[bbox_index]
else:
# TODO: REC
cls_score = cls_score.sigmoid()
scores, _ = cls_score.max(-1)
scores, indexes = scores.topk(max_per_img)
bbox_pred = bbox_pred[indexes]
det_labels = scores.new_zeros(scores.shape, dtype=torch.long)
scores, indexes = cls_score.view(-1).topk(max_per_img)
num_classes = cls_score.shape[-1]
det_labels = indexes % num_classes
bbox_index = indexes // num_classes
bbox_pred = bbox_pred[bbox_index]

det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
Expand Down Expand Up @@ -492,7 +509,12 @@ def loss(self, hidden_states: Tensor, references: List[Tensor],
batch_img_metas.append(data_sample.metainfo)
batch_gt_instances.append(data_sample.gt_instances)

outs = self(hidden_states, references, memory_text, text_token_mask)
if 'tokens_positive' in batch_data_samples[0]:
need_expand = True
else:
need_expand = False

outs = self(hidden_states, references, memory_text, text_token_mask, need_expand)
self.text_masks = text_token_mask
loss_inputs = outs + (enc_outputs_class, enc_outputs_coord,
batch_gt_instances, batch_img_metas, dn_meta)
Expand Down Expand Up @@ -539,22 +561,28 @@ def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
# ===== this change =====
# Loss is not computed for the padded regions of the text.
assert (self.text_masks.dim() == 2)
text_masks = self.text_masks.new_zeros(
(self.text_masks.size(0), self.max_text_len))
text_masks[:, :self.text_masks.size(1)] = self.text_masks
if 'positive_maps' in batch_gt_instances[0]:
text_masks = self.text_masks.new_zeros(
(self.text_masks.size(0), self.max_text_len))
text_masks[:, :self.text_masks.size(1)] = self.text_masks
else:
text_masks = self.text_masks
num_classes = cls_scores.size(-1)
labels = F.one_hot(labels, num_classes=num_classes + 1)
labels = labels[..., :num_classes]
text_mask = (text_masks > 0).unsqueeze(1)
text_mask = text_mask.repeat(1, cls_scores.size(1), 1)
cls_scores = torch.masked_select(cls_scores, text_mask).contiguous()

labels = torch.masked_select(labels, text_mask)
label_weights = label_weights[...,
None].repeat(1, 1, text_mask.size(-1))
None].repeat(1, 1, text_mask.size(-1))
label_weights = torch.masked_select(label_weights, text_mask)

# classification loss
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor = num_total_pos * 1.0 + \
num_total_neg * self.bg_cls_weight
num_total_neg * self.bg_cls_weight
if self.sync_cls_avg_factor:
cls_avg_factor = reduce_mean(
cls_scores.new_tensor([cls_avg_factor]))
Expand All @@ -566,6 +594,9 @@ def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
else:
loss_cls = self.loss_cls(
cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
if torch.isnan(loss_cls):
print(f'has nan of loss_cls')
loss_cls = cls_scores.sum() * 0

# Compute the average number of gt boxes across all gpus, for
# normalization purposes
Expand All @@ -578,7 +609,7 @@ def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
img_h, img_w, = img_meta['img_shape']
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0).repeat(
bbox_pred.size(0), 1)
bbox_pred.size(0), 1)
factors.append(factor)
factors = torch.cat(factors, 0)

Expand All @@ -592,10 +623,15 @@ def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor,
# regression IoU loss, defaultly GIoU loss
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

if torch.isnan(loss_iou):
print(f'has nan of loss_iou')
loss_iou = bboxes.sum() * 0
# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
if torch.isnan(loss_bbox):
print(f'has nan of loss_bbox')
loss_bbox = bbox_preds.sum() * 0
return loss_cls, loss_bbox, loss_iou

def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
Expand Down Expand Up @@ -637,15 +673,23 @@ def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
# ===== this change =====
# Loss is not computed for the padded regions of the text.
assert (self.text_masks.dim() == 2)
text_masks = self.text_masks.new_zeros(
(self.text_masks.size(0), self.max_text_len))
text_masks[:, :self.text_masks.size(1)] = self.text_masks
if 'positive_maps' in batch_gt_instances[0]:
text_masks = self.text_masks.new_zeros(
(self.text_masks.size(0), self.max_text_len))
text_masks[:, :self.text_masks.size(1)] = self.text_masks
else:
text_masks = self.text_masks
num_classes = dn_cls_scores.size(-1)
# Temporary workaround: _get_dn_targets_single cannot access dn_cls_scores
labels[labels == self.max_text_len] = num_classes
labels = F.one_hot(labels, num_classes=num_classes + 1)
labels = labels[..., :num_classes]
text_mask = (text_masks > 0).unsqueeze(1)
text_mask = text_mask.repeat(1, dn_cls_scores.size(1), 1)
cls_scores = torch.masked_select(dn_cls_scores, text_mask).contiguous()

labels = torch.masked_select(labels, text_mask)
label_weights = label_weights[...,
None].repeat(1, 1, text_mask.size(-1))
label_weights = label_weights[..., None].repeat(1, 1, text_mask.size(-1))
label_weights = torch.masked_select(label_weights, text_mask)
# =======================

Expand All @@ -667,6 +711,9 @@ def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
labels,
label_weights,
avg_factor=cls_avg_factor)
if torch.isnan(loss_cls):
print(f'has nan of dn loss_cls')
loss_cls = cls_scores.sum() * 0
else:
loss_cls = torch.zeros(
1, dtype=cls_scores.dtype, device=cls_scores.device)
Expand All @@ -682,7 +729,7 @@ def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
img_h, img_w = img_meta['img_shape']
factor = bbox_pred.new_tensor([img_w, img_h, img_w,
img_h]).unsqueeze(0).repeat(
bbox_pred.size(0), 1)
bbox_pred.size(0), 1)
factors.append(factor)
factors = torch.cat(factors)

Expand All @@ -696,15 +743,21 @@ def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor,
# regression IoU loss, defaultly GIoU loss
loss_iou = self.loss_iou(
bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)
if torch.isnan(loss_iou):
print(f'has nan of dn loss_iou')
loss_iou = bboxes.sum() * 0

# regression L1 loss
loss_bbox = self.loss_bbox(
bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
if torch.isnan(loss_bbox):
print(f'has nan of dn loss_bbox')
loss_bbox = bbox_preds.sum() * 0
return loss_cls, loss_bbox, loss_iou

def _get_dn_targets_single(self, gt_instances: InstanceData,
img_meta: dict, dn_meta: Dict[str,
int]) -> tuple:
int]) -> tuple:
"""Get targets in denoising part for one image.

Args:
Expand Down Expand Up @@ -749,10 +802,17 @@ def _get_dn_targets_single(self, gt_instances: InstanceData,
neg_inds = pos_inds + num_queries_each_group // 2
# label targets
# this change
labels = gt_bboxes.new_full((num_denoising_queries, self.max_text_len),
0,
dtype=torch.float32)
labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds]

if 'positive_maps' in gt_instances:
labels = gt_bboxes.new_full((num_denoising_queries, self.max_text_len),
0,
dtype=torch.float32)
labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds]
else:
labels = gt_bboxes.new_full((num_denoising_queries,),
self.max_text_len,
dtype=torch.long)
labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
label_weights = gt_bboxes.new_ones(num_denoising_queries)

# bbox targets
Expand Down
18 changes: 10 additions & 8 deletions mmdet/models/detectors/grounding_dino.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,8 +329,8 @@ def forward_encoder(self, feat: Tensor, feat_mask: Tensor,
# for text encoder
memory_text=text_dict['embedded'],
text_attention_mask=~text_token_mask,
position_ids=text_dict['position_ids'],
text_self_attention_masks=text_dict['masks'])
position_ids=text_dict.get('position_ids', None),
text_self_attention_masks=text_dict.get('masks', None))
encoder_outputs_dict = dict(
memory=memory,
memory_mask=feat_mask,
Expand All @@ -353,13 +353,15 @@ def pre_decoder(
output_memory, output_proposals = self.gen_encoder_output_proposals(
memory, memory_mask, spatial_shapes)

if ('tokens_positive' in batch_data_samples[0] and batch_data_samples[0].tokens_positive !=-1) \
or 'token_positive_map' in batch_data_samples[0]:
need_expand = True
else:
need_expand = False
enc_outputs_class = self.bbox_head.cls_branches[
self.decoder.num_layers](output_memory, memory_text,
text_token_mask)
cls_out_features = self.bbox_head.cls_branches[
self.decoder.num_layers].max_text_len
self.decoder.num_layers](output_memory, memory_text, text_token_mask, need_expand)
enc_outputs_coord_unact = self.bbox_head.reg_branches[
self.decoder.num_layers](output_memory) + output_proposals
self.decoder.num_layers](output_memory) + output_proposals

# NOTE The DINO selects top-k proposals according to scores of
# multi-class classification, while DeformDETR, where the input
Expand All @@ -370,7 +372,7 @@ def pre_decoder(

topk_score = torch.gather(
enc_outputs_class, 1,
topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features))
topk_indices.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]))
topk_coords_unact = torch.gather(
enc_outputs_coord_unact, 1,
topk_indices.unsqueeze(-1).repeat(1, 1, 4))
Expand Down
Loading