from collections import OrderedDict

import torch
import torchvision
from torch import nn
from torchvision.models import maxvit_t
from torchvision.models.detection.backbone_utils import BackboneWithFPN

from libs.vision_libs import models
from libs.vision_libs.models import mobilenet_v3_large, EfficientNet_V2_S_Weights, efficientnet_v2_s, \
    EfficientNet_V2_M_Weights, efficientnet_v2_m, EfficientNet_V2_L_Weights, efficientnet_v2_l, ConvNeXt_Base_Weights
from libs.vision_libs.models._utils import _ovewrite_value_param, handle_legacy_interface
from libs.vision_libs.models.detection import FasterRCNN
from libs.vision_libs.models.detection.anchor_utils import AnchorGenerator
from libs.vision_libs.models.detection.ssdlite import _mobilenet_extractor
from libs.vision_libs.models.resnet import resnet50, ResNet50_Weights, resnet18
from libs.vision_libs.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
from libs.vision_libs.ops import misc as misc_nn_ops, MultiScaleRoIAlign

from models.base.high_reso_swin import swin_v2_t


def get_resnet50_fpn():
    """ResNet-50 backbone wrapped with an FPN (all stages trainable, BatchNorm)."""
    is_trained = False
    trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 5, 3)
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
    backbone = resnet50(weights=None, progress=True, norm_layer=norm_layer)
    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
    return backbone


def get_resnet18_fpn():
    """ResNet-18 backbone wrapped with an FPN (all stages trainable, BatchNorm)."""
    is_trained = False
    trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 5, 3)
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
    backbone = resnet18(weights=None, progress=True, norm_layer=norm_layer)
    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
    return backbone


def get_mobilenet_v3_large_fpn():
    """MobileNetV3-Large backbone wrapped with an FPN."""
    is_trained = False
    trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 6, 3)
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
    backbone = mobilenet_v3_large(weights=None, progress=True, norm_layer=norm_layer)
    backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)
    return backbone


def get_convnext_fpn():
    """ConvNeXt-Base backbone wrapped with an FPN over its four stages."""
    convnext = models.convnext_base(weights=ConvNeXt_Base_Weights.DEFAULT)
    # convnext = models.convnext_small(pretrained=True)
    # convnext = models.convnext_large(pretrained=True)
    in_channels_list = [128, 256, 512, 1024]
    backbone_with_fpn = BackboneWithFPN(
        convnext.features,
        return_layers={'1': '0', '3': '1', '5': '2', '7': '3'},  # make sure these keys map to actual layers
        in_channels_list=in_channels_list,
        out_channels=256
    )
    return backbone_with_fpn


def get_maxvit_fpn(input_size=(224 * 7, 224 * 7)):
    """MaxViT-T backbone wrapped with an FPN, plus a matching anchor generator and RoI pooler."""
    maxvit = MaxVitBackbone(input_size=input_size)
    # print(maxvit.named_children())
    # for i, layer in enumerate(maxvit.named_children()):
    #     print(f'layer:{i}:{layer}')
    test_input = torch.randn(1, 3, 224 * 7, 224 * 7)
    in_channels_list = [64, 64, 128, 256, 512]
    featmap_names = ['0', '1', '2', '3', '4', 'pool']
    # print(f'featmap_names:{featmap_names}')
    roi_pooler = MultiScaleRoIAlign(
        featmap_names=featmap_names,
        output_size=7,
        sampling_ratio=2
    )
    backbone_with_fpn = BackboneWithFPN(
        maxvit,
        return_layers={'stem': '0', 'block0': '1', 'block1': '2', 'block2': '3', 'block3': '4'},  # make sure these keys map to actual layers
        in_channels_list=in_channels_list,
        out_channels=256
    )
    rpn_anchor_generator = get_anchor_generator(backbone_with_fpn, test_input=test_input)
    return backbone_with_fpn, rpn_anchor_generator, roi_pooler
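
# Usage sketch (illustrative only, not called anywhere else in this module):
# the helpers above return a BackboneWithFPN whose out_channels attribute lets
# it drop straight into a torchvision-style FasterRCNN. num_classes=3 and the
# 512x512 random input are assumptions made for the example.
def _example_fasterrcnn_from_resnet50_fpn(num_classes=3):
    backbone = get_resnet50_fpn()  # FPN levels '0'..'3' plus 'pool', 256 channels each
    model = FasterRCNN(backbone=backbone, num_classes=num_classes)
    model.eval()
    with torch.no_grad():
        # In eval mode the model takes a list of CHW tensors and returns one
        # dict (boxes / labels / scores) per image.
        detections = model([torch.randn(3, 512, 512)])
    return model, detections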
def get_efficientnetv2_fpn(name='efficientnet_v2_m', pretrained=True):
    """EfficientNetV2 backbone (S/M/L) wrapped with an FPN over five feature levels."""
    # Load the requested EfficientNetV2 variant
    if name == 'efficientnet_v2_s':
        weights = EfficientNet_V2_S_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = efficientnet_v2_s(weights=weights).features
    elif name == 'efficientnet_v2_m':
        weights = EfficientNet_V2_M_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = efficientnet_v2_m(weights=weights).features
    elif name == 'efficientnet_v2_l':
        weights = EfficientNet_V2_L_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = efficientnet_v2_l(weights=weights).features
    else:
        raise ValueError(f"Unknown EfficientNetV2 variant: {name}")
    # Stage indices to expose and their FPN output names
    return_layers = {"1": "0", "2": "1", "3": "2", "4": "3", "6": "4"}
    dummy_input = torch.randn(1, 3, 512, 512)
    # out = backbone(dummy_input)
    # print(f'out:{out}')
    # Collect the output channel count of each selected stage
    in_channels_list = []
    for layer_idx in [1, 2, 3, 4, 6]:
        module = backbone[layer_idx]
        # print(f'efficientnet:{backbone}')
        if hasattr(module, 'out_channels'):
            in_channels_list.append(module.out_channels)
        elif hasattr(module[-1], 'out_channels'):
            # If the stage itself has no out_channels, fall back to its last block
            in_channels_list.append(module[-1].out_channels)
        else:
            raise ValueError(f"Cannot determine out_channels for layer {layer_idx}")
    # Wrap the backbone with an FPN
    print(f'in_channels_list: {in_channels_list}')
    backbone_with_fpn = BackboneWithFPN(
        backbone=backbone,
        return_layers=return_layers,
        in_channels_list=in_channels_list,
        out_channels=256
    )
    out = backbone_with_fpn(dummy_input)
    print(f"out0: {out['0'].shape}")
    print(f"out1: {out['1'].shape}")
    print(f"out2: {out['2'].shape}")
    print(f"out3: {out['3'].shape}")
    print(f"out4: {out['4'].shape}")
    return backbone_with_fpn


# Load a ConvNeXt model
# convnext = models.convnext_base(pretrained=True)
# convnext = models.convnext_tiny(pretrained=True)
# convnext = models.convnext_small(pretrained=True)
# print(convnext)
# # Print all named sub-layers of the model
# for name, _ in convnext.features[5].named_children():
#     print(name)
# Adapt ConvNeXt for Faster R-CNN


def get_anchor_generator(backbone, test_input):
    """Build an AnchorGenerator with one anchor size per feature map produced by the backbone."""
    features = backbone(test_input)
    # All feature maps returned by the backbone
    featmap_names = list(features.keys())
    print(f'featmap_names:{featmap_names}')
    num_features = len(features)  # number of feature maps
    print(f'num_features:{num_features}')
    # num_features = num_features - 1
    # # Per-level anchor sizes and aspect ratios
    # base_sizes = [32, 64, 128]  # supports up to 4 levels
    # sizes = tuple((size,) for size in base_sizes[:num_features])
    anchor_sizes = tuple((int(16 * 2 ** i),) for i in range(num_features))  # 16, 32, 64, ... one size per level
    print(f'anchor_sizes:{anchor_sizes}')
    aspect_ratios = ((0.5, 1.0, 2.0),) * num_features
    print(f'aspect_ratios:{aspect_ratios}')
    return AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)


class MaxVitBackbone(torch.nn.Module):
    """Expose the MaxViT-T stem and its four blocks as named children for BackboneWithFPN."""

    def __init__(self, input_size=(224 * 7, 224 * 7)):
        super(MaxVitBackbone, self).__init__()
        # Reuse parts of MaxViT as the feature extractor
        maxvit_model = maxvit_t(weights=None, input_size=input_size)
        self.stem = maxvit_model.stem  # stem
        self.block0 = maxvit_model.blocks[0]
        self.block1 = maxvit_model.blocks[1]
        self.block2 = maxvit_model.blocks[2]
        self.block3 = maxvit_model.blocks[3]

    def forward(self, x):
        print("Input size:", x.shape)
        x = self.stem(x)
        print("After stem size:", x.shape)
        x = self.block0(x)
        print("After block0 size:", x.shape)
        x = self.block1(x)
        print("After block1 size:", x.shape)
        x = self.block2(x)
        print("After block2 size:", x.shape)
        x = self.block3(x)
        print("After block3 size:", x.shape)
        return x


from torchvision.models.feature_extraction import create_feature_extractor
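
# Usage sketch (illustrative; num_classes, the 512x512 input, and the choice of
# the 'efficientnet_v2_s' variant are assumptions). The EfficientNetV2 backbone
# above exposes six feature maps ('0'..'4' plus 'pool'), so FasterRCNN's default
# 5-level anchor generator does not fit; get_anchor_generator() builds one
# anchor size per returned map instead.
def _example_fasterrcnn_from_efficientnetv2(num_classes=3):
    backbone = get_efficientnetv2_fpn(name='efficientnet_v2_s', pretrained=False)
    anchor_generator = get_anchor_generator(backbone, test_input=torch.randn(1, 3, 512, 512))
    roi_pooler = MultiScaleRoIAlign(
        featmap_names=['0', '1', '2', '3', '4'],  # 'pool' level left out of RoI pooling
        output_size=7,
        sampling_ratio=2
    )
    return FasterRCNN(
        backbone=backbone,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )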
def get_swin_transformer_fpn(type='t'):
    """Swin Transformer V2 backbone (t/s/b) wrapped with an FPN, plus a matching RoI pooler and anchor generator."""

    class Trans(nn.Module):
        """Permute between Swin's channels-last layout and the channels-first layout the FPN expects."""

        def forward(self, x):
            # [B, H, W, C] -> [B, C, W, H]; this also swaps the two spatial
            # dims, which is harmless for the square inputs used here.
            x = x.permute(0, 3, 2, 1).contiguous()
            return x

    class SwinTransformer(nn.Module):
        def __init__(self, type='t'):
            super().__init__()
            swin = swin_v2_t(weights=None)
            if type == 't':
                # Load Swin Transformer V2 Tiny (high-resolution variant)
                # swin = torchvision.models.swin_v2_t(weights=None)
                swin = swin_v2_t(weights=None)
            if type == 's':
                swin = torchvision.models.swin_v2_s(weights=None)
            if type == 'b':
                swin = torchvision.models.swin_v2_b(weights=None)
            # for i, layer in enumerate(swin.named_children()):
            #     print(f'layer{i}:,{layer}')
            # Keep the layers whose outputs feed the FPN
            self.layer0 = swin.features[0]  # layer 0: patch embedding
            self.layer1 = nn.Sequential(swin.features[1], Trans())  # layer 1: stage 1
            self.layer2 = nn.Sequential(Trans(), swin.features[2])  # layer 2: downsampling
            self.layer3 = nn.Sequential(swin.features[3], Trans())  # layer 3: stage 2
            self.layer4 = nn.Sequential(Trans(), swin.features[4])  # layer 4: downsampling
            self.layer5 = nn.Sequential(swin.features[5], Trans())  # layer 5: stage 3
            self.layer6 = nn.Sequential(Trans(), swin.features[6])  # layer 6: downsampling
            self.layer7 = nn.Sequential(swin.features[7], Trans())  # layer 7: stage 4

        def forward(self, x):
            x = self.layer0(x)  # [B, C, H, W] -> [B, H_, W_, C]
            print(f'x0:{x.shape}')
            x = self.layer1(x)
            print(f'x1:{x.shape}')
            x = self.layer2(x)
            x = self.layer3(x)
            print(f'x2:{x.shape}')
            x = self.layer4(x)
            x = self.layer5(x)
            print(f'x3:{x.shape}')
            x = self.layer6(x)
            x = self.layer7(x)
            print(f'x4:{x.shape}')
            return x

    backbone = SwinTransformer(type=type)
    dummy_input = torch.randn(1, 3, 512, 512)
    out = backbone(dummy_input)
    # print(f'out:{out.keys()}')
    # for i, layer in enumerate(swin.features.named_children()):
    #     print(f'layer:{i}:{layer}')
    # out = swin(dummy_input)
    # print(f'out shape:{out.shape}')
    # channels_list = [96, 192, 384, 768]
    if type == 't':
        channels_list = [96, 192, 384, 768]
    if type == 's':
        channels_list = [96, 192, 384, 768]
    if type == 'b':
        channels_list = [128, 256, 512, 1024]
    backbone_with_fpn = BackboneWithFPN(
        # swin.features,
        backbone,
        return_layers={'layer1': '0', 'layer3': '1', 'layer5': '2', 'layer7': '3'},
        in_channels_list=channels_list,
        out_channels=128
    )
    featmap_names = ['0', '1', '2', '3', 'pool']
    # print(f'featmap_names:{featmap_names}')
    roi_pooler = MultiScaleRoIAlign(
        featmap_names=featmap_names,
        output_size=7,
        sampling_ratio=2
    )
    # out = backbone_with_fpn(dummy_input)
    anchor_generator = get_anchor_generator(backbone_with_fpn, test_input=dummy_input)
    # print(f'out:{out}')
    return backbone_with_fpn, roi_pooler, anchor_generator
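
# Usage sketch (illustrative; num_classes and the 512x512 random input are
# assumptions). get_swin_transformer_fpn() already returns a matching RoI
# pooler and anchor generator, so its three return values can be passed
# straight to FasterRCNN.
def _example_fasterrcnn_from_swin(num_classes=3):
    backbone_with_fpn, roi_pooler, anchor_generator = get_swin_transformer_fpn(type='t')
    model = FasterRCNN(
        backbone=backbone_with_fpn,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler
    )
    model.eval()
    with torch.no_grad():
        detections = model([torch.randn(3, 512, 512)])
    return detections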
if __name__ == '__main__':
    # backbone_with_fpn, roi_pooler, anchor_generator = get_swin_transformer_fpn(type='t')
    # model = FasterRCNN(backbone=backbone_with_fpn, num_classes=3, box_roi_pool=roi_pooler, rpn_anchor_generator=anchor_generator)
    # input = torch.randn(3, 3, 512, 512, device='cuda')
    # model.eval()
    # model.to('cuda')
    # out = model(input)
    # out = backbone_with_fpn(input)
    # print(f'out:{out.shape}')
    backbone = get_efficientnetv2_fpn(name='efficientnet_v2_l')
    print(backbone)

    # # maxvit = models.maxvit_t(pretrained=True)
    # maxvit = MaxVitBackbone()
    # # print(maxvit.named_children())
    # # for i, layer in enumerate(maxvit.named_children()):
    # #     print(f'layer:{i}:{layer}')
    #
    # in_channels_list = [64, 64, 128, 256, 512]
    # backbone_with_fpn = BackboneWithFPN(
    #     maxvit,
    #     return_layers={'stem': '0', 'block0': '1', 'block1': '2', 'block2': '3', 'block3': '4'},  # make sure these keys map to actual layers
    #     in_channels_list=in_channels_list,
    #     out_channels=256
    # )
    # model = FasterRCNN(
    #     backbone=backbone_with_fpn,
    #     num_classes=91,  # COCO has 91 classes
    #     # rpn_anchor_generator=anchor_generator,
    #     # box_roi_pool=roi_pooler
    # )
    #
    # test_input = torch.randn(1, 3, 896, 896)
    #
    # with torch.no_grad():
    #     output = backbone_with_fpn(test_input)
    #
    # print("Output feature maps:")
    # for k, v in output.items():
    #     print(f"{k}: {v.shape}")
    # model.eval()
    # output = model(test_input)
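
    # Optional extra smoke tests (assumptions, not part of the original script):
    # the helpers below are the illustrative sketches defined earlier in this
    # module; uncomment to run them with randomly initialized weights.
    # _example_fasterrcnn_from_resnet50_fpn(num_classes=3)
    # _example_fasterrcnn_from_efficientnetv2(num_classes=3)
    # _example_fasterrcnn_from_swin(num_classes=3)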