lstlm
/
pokouqiege


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
							from collections import OrderedDict

import torchvision
from torchvision.models import maxvit_t
from torchvision.models.detection.backbone_utils import BackboneWithFPN

from libs.vision_libs import models
from libs.vision_libs.models import mobilenet_v3_large, EfficientNet_V2_S_Weights, efficientnet_v2_s, \
    EfficientNet_V2_M_Weights, efficientnet_v2_m, EfficientNet_V2_L_Weights, efficientnet_v2_l, ConvNeXt_Base_Weights
from libs.vision_libs.models._utils import _ovewrite_value_param, handle_legacy_interface
from libs.vision_libs.models.detection import FasterRCNN
from libs.vision_libs.models.detection.anchor_utils import AnchorGenerator
from libs.vision_libs.models.detection.ssdlite import _mobilenet_extractor
from libs.vision_libs.models.resnet import resnet50, ResNet50_Weights, resnet18

from libs.vision_libs.models.detection.backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers
from libs.vision_libs.ops import misc as misc_nn_ops, MultiScaleRoIAlign
from torch import nn

import torch


def get_resnet50_fpn():
    is_trained = False
    trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 5, 3)
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
    backbone = resnet50(weights=None, progress=True, norm_layer=norm_layer)
    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
    return backbone


def get_resnet18_fpn():
    is_trained = False
    trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 5, 3)
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d
    backbone = resnet18(weights=None, progress=True, norm_layer=norm_layer)
    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
    return backbone


def get_mobilenet_v3_large_fpn():
    is_trained = False
    trainable_backbone_layers = _validate_trainable_layers(is_trained, None, 6, 3)
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d

    backbone = mobilenet_v3_large(weights=None, progress=True, norm_layer=norm_layer)
    backbone = _mobilenet_extractor(backbone, True, trainable_backbone_layers)
    return backbone

def get_convnext_fpn():
    convnext = models.convnext_base(weights=ConvNeXt_Base_Weights.DEFAULT)
    # convnext = models.convnext_small(pretrained=True)
    # convnext = models.convnext_large(pretrained=True)

    in_channels_list = [128, 256, 512, 1024]
    backbone_with_fpn = BackboneWithFPN(
        convnext.features,
        return_layers={'1': '0', '3': '1', '5': '2', '7': '3'},  # 确保这些键对应到实际的层
        in_channels_list=in_channels_list,
        out_channels=256
    )
    return backbone_with_fpn

def get_maxvit_fpn(input_size=(224*7,224*7)):
    maxvit = MaxVitBackbone(input_size=input_size)
    # print(maxvit.named_children())

    # for i,layer in enumerate(maxvit.named_children()):
    #     print(f'layer:{i}:{layer}')

    test_input = torch.randn(1, 3, 224 * 7, 224 * 7)

    in_channels_list = [64, 64, 128, 256, 512]
    featmap_names = ['0', '1', '2', '3', '4', 'pool']
    # print(f'featmap_names:{featmap_names}')
    roi_pooler = MultiScaleRoIAlign(
        featmap_names=featmap_names,
        output_size=7,
        sampling_ratio=2
    )
    backbone_with_fpn = BackboneWithFPN(
        maxvit,
        return_layers={'stem': '0', 'block0': '1', 'block1': '2', 'block2': '3', 'block3': '4'},  # ç¡®ä¿è¿äºé®å¯¹åºå°å®éçå±
        in_channels_list=in_channels_list,
        out_channels=256
    )
    rpn_anchor_generator = get_anchor_generator(backbone_with_fpn, test_input=test_input),

    return  backbone_with_fpn,rpn_anchor_generator,roi_pooler

def get_efficientnetv2_fpn(name='efficientnet_v2_m', pretrained=True):
    # 加载EfficientNetV2模型
    if name == 'efficientnet_v2_s':
        weights = EfficientNet_V2_S_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = efficientnet_v2_s(weights=weights).features
    if name == 'efficientnet_v2_m':
        weights = EfficientNet_V2_M_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = efficientnet_v2_m(weights=weights).features
    if name == 'efficientnet_v2_l':
        weights = EfficientNet_V2_L_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = efficientnet_v2_l(weights=weights).features

    # 定义返回的层索引和名称
    return_layers = {"2": "0", "3": "1", "4": "2", "5": "3"}

    # 获取每个层输出通道数
    in_channels_list = []
    for layer_idx in [2, 3, 4, 5]:
        module = backbone[layer_idx]
        if hasattr(module, 'out_channels'):
            in_channels_list.append(module.out_channels)
        elif hasattr(module[-1], 'out_channels'):
            # 如果module本身没有out_channels，检查最后一个子模块
            in_channels_list.append(module[-1].out_channels)
        else:
            raise ValueError(f"Cannot determine out_channels for layer {layer_idx}")

    # 使用BackboneWithFPN包装backbone
    backbone_with_fpn = BackboneWithFPN(
        backbone=backbone,
        return_layers=return_layers,
        in_channels_list=in_channels_list,
        out_channels=256
    )

    return backbone_with_fpn


# 加载 ConvNeXt 模型
# convnext = models.convnext_base(pretrained=True)
# convnext = models.convnext_tiny(pretrained=True)
# convnext = models.convnext_small(pretrained=True)
# print(convnext)
# # 打印模型的所有命名层
# for name, _ in convnext.features[5].named_children():
#     print(name)

# 修改 ConvNeXt 以适应 Faster R-CNN
# 修改 ConvNeXt 以适应 Faster R-CNN

def get_anchor_generator(backbone, test_input):
    features = backbone(test_input)  # 获取 backbone 输出的所有特征图
    featmap_names = list(features.keys())
    print(f'featmap_names:{featmap_names}')
    num_features = len(features)    # 特征图数量
    print(f'num_features:{num_features}')
    # num_features=num_features-1

    # # 定义每层的 anchor 尺寸和比例
    # base_sizes = [32, 64, 128]  # 支持最多 4 层
    # sizes = tuple((size,) for size in base_sizes[:num_features])

    anchor_sizes = tuple((int(16 * 2 ** i),) for i in range(num_features))  # 自动生成不同大小
    print(f'anchor_sizes:{anchor_sizes }')
    aspect_ratios = ((0.5, 1.0, 2.0),) * num_features
    print(f'aspect_ratios:{aspect_ratios}')

    return AnchorGenerator(sizes=anchor_sizes , aspect_ratios=aspect_ratios)


class MaxVitBackbone(torch.nn.Module):
    def __init__(self,input_size=(224*7,224*7)):
        super(MaxVitBackbone, self).__init__()
        # æåMaxVitçé¨åå±ä½ä¸ºç¹å¾æåå¨
        maxvit_model =maxvit_t(pretrained=False,input_size=input_size)
        self.stem = maxvit_model.stem  # Stemå±
        self.block0= maxvit_model.blocks[0]
        self.block1 = maxvit_model.blocks[1]
        self.block2 = maxvit_model.blocks[2]
        self.block3 = maxvit_model.blocks[3]

    def forward(self, x):
        print("Input size:", x.shape)
        x = self.stem(x)
        print("After stem size:", x.shape)
        x = self.block0(x)
        print("After block0 size:", x.shape)
        x = self.block1(x)
        print("After block1 size:", x.shape)
        x = self.block2(x)
        print("After block2 size:", x.shape)
        x = self.block3(x)
        print("After block3 size:", x.shape)
        return x


from torchvision.models.feature_extraction import create_feature_extractor

def get_swin_transformer_fpn(type='t'):
    class Trans(nn.Module):
        def __init__(self):
            super().__init__()
        def forward(self,x):
            x=x.permute(0, 3, 2, 1).contiguous()
            return x


    class SwinTransformer(nn.Module):
        def __init__(self,type='t'):
            super().__init__()
            swin = torchvision.models.swin_v2_t(weights=None)
            if type=='t':
                # 加载 Swin Transformer v2 Tiny
                swin = torchvision.models.swin_v2_t(weights=None)
            if type=='s':
                swin=torchvision.models.swin_v2_s(weights=None)
            if type=='b':
                swin=torchvision.models.swin_v2_b(weights=None)

            for i,layer in enumerate(swin.named_children()):
                print(f'layer{i}:,{layer}')
            # 保存需要提取的层
            self.layer0 = swin.features[0]  # 第0层 patch embedding
            self.layer1 =nn.Sequential(swin.features[1],Trans())  # 第1层 stage1
            self.layer2 =nn.Sequential(Trans(),swin.features[2]) # 第2层 downsample
            self.layer3 =nn.Sequential(swin.features[3], Trans()) # 第3层 stage2
            self.layer4 =nn.Sequential( Trans(),swin.features[4])  # 第4层 downsample
            self.layer5 =nn.Sequential(swin.features[5], Trans())  # 第5层 stage3
            self.layer6 =nn.Sequential(Trans(),swin.features[6]) # 第6层 downsample
            self.layer7 =nn.Sequential(swin.features[7], Trans())  # 第7层 stage4

        def forward(self, x):

            x = self.layer0(x)  # [B, C, H, W] -> [B, H_, W_, C]
            print(f'x0:{x.shape}')
            x = self.layer1(x)
            print(f'x1:{x.shape}')
            x = self.layer2(x)
            x = self.layer3(x)
            print(f'x2:{x.shape}')
            x = self.layer4(x)
            x = self.layer5(x)
            print(f'x3:{x.shape}')
            x = self.layer6(x)
            x = self.layer7(x)
            print(f'x4:{x.shape}')
            return x

    backbone = SwinTransformer(type=type)
    input=torch.randn(1,3,512,512)
    out=backbone(input)
    # print(f'out:{out.keys()}')
    # for i,layer in enumerate(swin.features.named_children()):
    #     print(f'layer:{i}:{layer}')
    # out=swin(input)
    # print(f'out shape:{out.shape}')
    #

    channels_list = [96, 192, 384, 768]
    if type=='t':
        channels_list = [96, 192, 384, 768]
    if type=='s':
        channels_list = [96, 192, 384, 768]
    if type=='b':
        channels_list = [128, 256, 512, 1024]
    backbone_with_fpn = BackboneWithFPN(
        # swin.features,
        backbone,
        return_layers={'layer1': '0', 'layer3': '1', 'layer5': '2', 'layer7': '3'},
        in_channels_list=channels_list,
        out_channels=256
    )
    featmap_names = ['0', '1', '2', '3', 'pool']
    # print(f'featmap_names:{featmap_names}')
    roi_pooler = MultiScaleRoIAlign(
        featmap_names=featmap_names,
        output_size=7,
        sampling_ratio=2
    )

    # out=backbone_with_fpn(input)
    anchor_generator = get_anchor_generator(backbone_with_fpn, test_input=input)


    # print(f'out:{out}')
    return  backbone_with_fpn,roi_pooler,anchor_generator
if __name__ == '__main__':
    backbone_with_fpn, roi_pooler, anchor_generator=get_swin_transformer_fpn(type='s')
    model=FasterRCNN(backbone=backbone_with_fpn,num_classes=3,box_roi_pool=roi_pooler,rpn_anchor_generator=anchor_generator)
    input=torch.randn(3,3,512,512,device='cuda')
    model.eval()
    model.to('cuda')
    out=model(input)


    # # maxvit = models.maxvit_t(pretrained=True)
    # maxvit=MaxVitBackbone()
    # # print(maxvit.named_children())
    #
    # for i,layer in enumerate(maxvit.named_children()):
    #     print(f'layer:{i}:{layer}')
    #
    # in_channels_list = [64,64,128, 256, 512]
    # backbone_with_fpn = BackboneWithFPN(
    #     maxvit,
    #     return_layers={'stem': '0','block0':'1','block1':'2','block2':'3','block3':'4'},  # 确保这些键对应到实际的层
    #     in_channels_list=in_channels_list,
    #     out_channels=256
    # )
    # model = FasterRCNN(
    #     backbone=backbone_with_fpn,
    #     num_classes=91,  # COCO 数据集有 91 类
    #     # rpn_anchor_generator=anchor_generator,
    #     # box_roi_pool=roi_pooler
    # )
    #
    # test_input = torch.randn(1, 3, 896, 896)
    #
    # with torch.no_grad():
    #     output = backbone_with_fpn(test_input)
    #
    # print("Output feature maps:")
    # for k, v in output.items():
    #     print(f"{k}: {v.shape}")
    # model.eval()
    # output=model(test_input)