背景: 计划将 GroundingDINO 模型部署在 C++ 端,目前准备复用 GitHub 上的一个 Paddle 实现。已经验证过该代码库(https://github.com/LokeZhou/PPGroundingDINO)的推理结果是正确的,基本可以证明模型和代码本身没有问题,但在进一步导出模型时遇到了一些错误。希望能得到一些方向性的指导和解答。
当前遇到的主要问题是导出时报错,报错内容:存在未初始化的变量,但是并没有显式地指出错误的具体变量,无从下手排查。
报错信息如下: InvalidArgumentError: The Tensor of Variable(eager_tmp_12) to be saved is not initialized. [Hint: Expected tensor.IsInitialized() == true, but received tensor.IsInitialized():0 != true:1.] (at /paddle/paddle/fluid/operators/save_combine_op.h:90) [operator < save_combine > error]
我的导出代码:
from paddle.static import InputSpec
import paddle.jit

# Put the backbone in inference mode so dropout / stochastic depth are
# disabled during tracing.
self.backbone[0].eval()
# Warm-up forward pass so any lazily created, non-parameter state is
# materialized before export. NOTE(review): uninitialized lazily-created
# tensors are a common cause of the save_combine
# "Tensor ... is not initialized" error — worth checking which sublayer
# creates state only inside forward().
self.backbone[0](samples.tensors)

# Describe the input symbolically instead of passing a concrete tensor:
# paddle.jit.to_static expects InputSpec entries in input_spec, and -1
# marks dimensions that may vary at inference time. (The previous code
# imported InputSpec but never used it, and created an unused
# mask_tensor.)
image_spec = InputSpec(shape=[-1, 3, -1, -1], dtype="float32", name="image")

# Convert the dynamic-graph model to a static graph and save it.
static_graph = paddle.jit.to_static(self.backbone[0], input_spec=[image_spec])
paddle.jit.save(
    static_graph,
    os.path.join("./", 'export_model_self_backbone'))
其中 self.backbone[0] 就是 GroundingDINO 模型中的 Swin Transformer 骨干网络。因为无法一次性导出整个 GroundingDINO,为此先尝试单独导出 Swin Transformer。该代码库的 Swin Transformer 实现与 PaddleDetection 的实现大致一致,这里给出其 __init__ 方法的代码:
import ...
def __init__(
    self,
    pretrain_img_size=224,
    patch_size=4,
    in_chans=3,
    embed_dim=96,
    depths=(2, 2, 6, 2),
    num_heads=(3, 6, 12, 24),
    window_size=7,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.2,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    out_indices=(0, 1, 2, 3),
    frozen_stages=-1,
    dilation=False,
    use_checkpoint=False,
):
    """Build a Swin Transformer backbone.

    Note: ``depths`` and ``num_heads`` defaults are tuples rather than
    lists to avoid the mutable-default-argument pitfall; they are only
    read (indexed, sliced, summed), so callers are unaffected.

    Args:
        pretrain_img_size (int): image size used in pretraining; only
            consulted when ``ape`` is True to size the position embedding.
        patch_size (int): stride of the initial patch embedding.
        in_chans (int): number of input image channels.
        embed_dim (int): channel dim of stage 0; doubled at every stage.
        depths (sequence[int]): number of transformer blocks per stage.
        num_heads (sequence[int]): attention heads per stage.
        window_size (int): local-attention window size.
        mlp_ratio (float): hidden/input dim ratio of each block's MLP.
        qkv_bias (bool): add learnable bias to the q, k, v projections.
        qk_scale (float | None): override default qk scaling when set.
        drop_rate (float): dropout rate after the (positional) embedding.
        attn_drop_rate (float): dropout rate on attention weights.
        drop_path_rate (float): maximum stochastic-depth rate; increased
            linearly from 0 over all blocks.
        norm_layer (type): normalization layer class.
        ape (bool): add a learnable absolute position embedding if True.
        patch_norm (bool): normalize after patch embedding if True.
        out_indices (tuple[int]): stages whose outputs are exposed.
        frozen_stages (int): freeze stages up to this index (-1: none).
        dilation (bool): if True, also drop the second-to-last downsample
            and halve the final feature dimension.
        use_checkpoint (bool): enable gradient checkpointing in blocks.
    """
    super().__init__()
    self.pretrain_img_size = pretrain_img_size
    self.num_layers = len(depths)
    self.embed_dim = embed_dim
    self.ape = ape
    self.patch_norm = patch_norm
    self.out_indices = out_indices
    self.frozen_stages = frozen_stages
    self.dilation = dilation

    # Split the image into non-overlapping patches.
    self.patch_embed = PatchEmbed(
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
        norm_layer=norm_layer if self.patch_norm else None,
    )

    # Optional learnable absolute position embedding (off by default).
    # NOTE(review): the parameter is created only when ape is True — a
    # conditionally created parameter like this is a typical suspect for
    # "uninitialized variable" errors during static-graph export.
    if self.ape:
        pretrain_img_size = to_2tuple(pretrain_img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [
            pretrain_img_size[0] // patch_size[0],
            pretrain_img_size[1] // patch_size[1],
        ]
        self.absolute_pos_embed = self.create_parameter(
            shape=[1, embed_dim, patches_resolution[0], patches_resolution[1]],
            dtype=paddle.float32,
            default_initializer=Constant(0.)
        )
        trunc_normal_(self.absolute_pos_embed, std=0.02)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # Stochastic depth: one linearly increasing rate per block.
    dpr = [
        x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))
    ]

    # Build the stages. Every stage but the last ends in a PatchMerging
    # downsample; with dilation the second-to-last downsample is dropped
    # too and the final feature dim is halved.
    self.layers = nn.LayerList()
    downsamplelist = [PatchMerging for i in range(self.num_layers)]
    downsamplelist[-1] = None
    num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
    if self.dilation:
        downsamplelist[-2] = None
        num_features[-1] = int(embed_dim * 2 ** (self.num_layers - 1)) // 2
    for i_layer in range(self.num_layers):
        layer = BasicLayer(
            dim=num_features[i_layer],
            depth=depths[i_layer],
            num_heads=num_heads[i_layer],
            window_size=window_size,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            # Slice out this stage's per-block drop-path rates.
            drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
            norm_layer=norm_layer,
            downsample=downsamplelist[i_layer],
            use_checkpoint=use_checkpoint,
        )
        self.layers.append(layer)

    self.num_features = num_features

    # Register one output norm layer per requested stage as "norm{i}".
    for i_layer in out_indices:
        layer = norm_layer(num_features[i_layer])
        layer_name = f"norm{i_layer}"
        self.add_sublayer(layer_name, layer)

    self._freeze_stages()