Text-to-Image · Diffusers · Safetensors

tolgacangoz committed (verified) · Commit 2e73038 · 1 Parent(s): c8393df

Upload 7 files
text_embedding_module/OCR/ocr_recog/RNN.py ADDED
@@ -0,0 +1,209 @@
import torch
from torch import nn

from .RecSVTR import Block


class Swish(nn.Module):
    def __init__(self):  # was misspelled `__int__`, so the constructor never actually ran
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)


class Im2Im(nn.Module):
    def __init__(self, in_channels, **kwargs):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        return x


class Im2Seq(nn.Module):
    def __init__(self, in_channels, **kwargs):
        super().__init__()
        self.out_channels = in_channels

    def forward(self, x):
        B, C, H, W = x.shape
        # assert H == 1
        x = x.reshape(B, C, H * W)
        x = x.permute((0, 2, 1))
        return x


class EncoderWithRNN(nn.Module):
    def __init__(self, in_channels, **kwargs):
        super(EncoderWithRNN, self).__init__()
        hidden_size = kwargs.get("hidden_size", 256)
        self.out_channels = hidden_size * 2
        self.lstm = nn.LSTM(in_channels, hidden_size, bidirectional=True, num_layers=2, batch_first=True)

    def forward(self, x):
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        return x


class SequenceEncoder(nn.Module):
    def __init__(self, in_channels, encoder_type="rnn", **kwargs):
        super(SequenceEncoder, self).__init__()
        self.encoder_reshape = Im2Seq(in_channels)
        self.out_channels = self.encoder_reshape.out_channels
        self.encoder_type = encoder_type
        if encoder_type == "reshape":
            self.only_reshape = True
        else:
            support_encoder_dict = {"reshape": Im2Seq, "rnn": EncoderWithRNN, "svtr": EncoderWithSVTR}
            assert encoder_type in support_encoder_dict, "{} must be in {}".format(
                encoder_type, support_encoder_dict.keys()
            )

            self.encoder = support_encoder_dict[encoder_type](self.encoder_reshape.out_channels, **kwargs)
            self.out_channels = self.encoder.out_channels
            self.only_reshape = False

    def forward(self, x):
        if self.encoder_type != "svtr":
            x = self.encoder_reshape(x)
            if not self.only_reshape:
                x = self.encoder(x)
            return x
        else:
            x = self.encoder(x)
            x = self.encoder_reshape(x)
            return x


class ConvBNLayer(nn.Module):
    def __init__(
        self, in_channels, out_channels, kernel_size=3, stride=1, padding=0, bias_attr=False, groups=1, act=nn.GELU
    ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            # weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
            bias=bias_attr,
        )
        self.norm = nn.BatchNorm2d(out_channels)
        self.act = Swish()  # note: the `act` argument is ignored; this layer always applies Swish

    def forward(self, inputs):
        out = self.conv(inputs)
        out = self.norm(out)
        out = self.act(out)
        return out


class EncoderWithSVTR(nn.Module):
    def __init__(
        self,
        in_channels,
        dims=64,  # XS
        depth=2,
        hidden_dims=120,
        use_guide=False,
        num_heads=8,
        qkv_bias=True,
        mlp_ratio=2.0,
        drop_rate=0.1,
        attn_drop_rate=0.1,
        drop_path=0.0,
        qk_scale=None,
    ):
        super(EncoderWithSVTR, self).__init__()
        self.depth = depth
        self.use_guide = use_guide
        self.conv1 = ConvBNLayer(in_channels, in_channels // 8, padding=1, act="swish")
        self.conv2 = ConvBNLayer(in_channels // 8, hidden_dims, kernel_size=1, act="swish")

        self.svtr_block = nn.ModuleList(
            [
                Block(
                    dim=hidden_dims,
                    num_heads=num_heads,
                    mixer="Global",
                    HW=None,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer="swish",
                    attn_drop=attn_drop_rate,
                    drop_path=drop_path,
                    norm_layer="nn.LayerNorm",
                    epsilon=1e-05,
                    prenorm=False,
                )
                for i in range(depth)
            ]
        )
        self.norm = nn.LayerNorm(hidden_dims, eps=1e-6)
        self.conv3 = ConvBNLayer(hidden_dims, in_channels, kernel_size=1, act="swish")
        # last conv-nxn, the input is concat of input tensor and conv3 output tensor
        self.conv4 = ConvBNLayer(2 * in_channels, in_channels // 8, padding=1, act="swish")

        self.conv1x1 = ConvBNLayer(in_channels // 8, dims, kernel_size=1, act="swish")
        self.out_channels = dims
        self.apply(self._init_weights)

    def _init_weights(self, m):
        # weight initialization
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.ConvTranspose2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)

    def forward(self, x):
        # use an off-graph copy as the guide; the Paddle original set `z.stop_gradient = True`,
        # and detach() is the PyTorch equivalent
        if self.use_guide:
            z = x.clone().detach()
        else:
            z = x
        # for short cut
        h = z
        # reduce dim
        z = self.conv1(z)
        z = self.conv2(z)
        # SVTR global block
        B, C, H, W = z.shape
        z = z.flatten(2).permute(0, 2, 1)

        for blk in self.svtr_block:
            z = blk(z)

        z = self.norm(z)
        # last stage
        z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2)
        z = self.conv3(z)
        z = torch.cat((h, z), dim=1)
        z = self.conv1x1(self.conv4(z))

        return z


if __name__ == "__main__":
    svtrRNN = EncoderWithSVTR(56)
    print(svtrRNN)
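
A minimal shape sketch of how `SequenceEncoder` is meant to be fed (the tensor sizes here are illustrative; it assumes the backbone has already collapsed the feature map to height 1):

import torch

feats = torch.rand(2, 512, 1, 80)  # hypothetical backbone output: (batch, channels, height=1, width)
encoder = SequenceEncoder(in_channels=512, encoder_type="rnn", hidden_size=256)
seq = encoder(feats)  # Im2Seq flattens to (2, 80, 512), then the 2-layer BiLSTM runs over width
print(seq.shape)  # torch.Size([2, 80, 512]) — out_channels = 2 * hidden_size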
text_embedding_module/OCR/ocr_recog/RecCTCHead.py ADDED
@@ -0,0 +1,45 @@
from torch import nn


class CTCHead(nn.Module):
    def __init__(
        self, in_channels, out_channels=6625, fc_decay=0.0004, mid_channels=None, return_feats=False, **kwargs
    ):
        super(CTCHead, self).__init__()
        if mid_channels is None:
            self.fc = nn.Linear(
                in_channels,
                out_channels,
                bias=True,
            )
        else:
            self.fc1 = nn.Linear(
                in_channels,
                mid_channels,
                bias=True,
            )
            self.fc2 = nn.Linear(
                mid_channels,
                out_channels,
                bias=True,
            )

        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.return_feats = return_feats

    def forward(self, x, labels=None):
        if self.mid_channels is None:
            predicts = self.fc(x)
        else:
            x = self.fc1(x)
            predicts = self.fc2(x)

        if self.return_feats:
            result = {}
            result["ctc"] = predicts
            result["ctc_neck"] = x
        else:
            result = predicts

        return result
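
A hedged usage sketch of `CTCHead`: it maps every time step of the neck output to per-character logits; the `out_channels` default of 6625 matches a large dictionary, and would normally be set to dictionary size plus the CTC blank (the sizes below are assumptions):

import torch

head = CTCHead(in_channels=512, out_channels=97)  # e.g. a 96-symbol dictionary + CTC blank
seq = torch.rand(2, 80, 512)  # (batch, time steps, neck features)
logits = head(seq)
print(logits.shape)  # torch.Size([2, 80, 97]); greedy decoding is argmax + CTC collapse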
text_embedding_module/OCR/ocr_recog/RecModel.py ADDED
@@ -0,0 +1,49 @@
import torch
from torch import nn

from .RecCTCHead import CTCHead
from .RecMv1_enhance import MobileNetV1Enhance
from .RNN import Im2Im, Im2Seq, SequenceEncoder


backbone_dict = {"MobileNetV1Enhance": MobileNetV1Enhance}
neck_dict = {"SequenceEncoder": SequenceEncoder, "Im2Seq": Im2Seq, "None": Im2Im}
head_dict = {"CTCHead": CTCHead}


class RecModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        # `config` is expected to be dict-like, supporting both `in` checks and attribute access
        assert "in_channels" in config, "in_channels must be in model config"
        backbone_type = config.backbone.pop("type")
        assert backbone_type in backbone_dict, f"backbone.type must be in {backbone_dict}"
        self.backbone = backbone_dict[backbone_type](config.in_channels, **config.backbone)

        neck_type = config.neck.pop("type")
        assert neck_type in neck_dict, f"neck.type must be in {neck_dict}"
        self.neck = neck_dict[neck_type](self.backbone.out_channels, **config.neck)

        head_type = config.head.pop("type")
        assert head_type in head_dict, f"head.type must be in {head_dict}"
        self.head = head_dict[head_type](self.neck.out_channels, **config.head)

        self.name = f"RecModel_{backbone_type}_{neck_type}_{head_type}"

    def load_3rd_state_dict(self, _3rd_name, _state):
        self.backbone.load_3rd_state_dict(_3rd_name, _state)
        self.neck.load_3rd_state_dict(_3rd_name, _state)
        self.head.load_3rd_state_dict(_3rd_name, _state)

    def forward(self, x):
        x = x.to(torch.float32)
        x = self.backbone(x)
        x = self.neck(x)
        x = self.head(x)
        return x

    def encode(self, x):
        x = self.backbone(x)
        x = self.neck(x)
        # assumes a head variant that exposes a `ctc_encoder` sub-module
        # (the CTCHead defined above does not)
        x = self.head.ctc_encoder(x)
        return x
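
A minimal construction sketch, assuming a dict-like config with attribute access (`AttrDict` and the concrete option values below are hypothetical, not part of this upload):

class AttrDict(dict):
    __getattr__ = dict.__getitem__  # hypothetical helper: RecModel needs both `in` checks and attribute access

config = AttrDict(
    in_channels=3,
    backbone=AttrDict(type="MobileNetV1Enhance", scale=0.5),
    neck=AttrDict(type="SequenceEncoder", encoder_type="rnn"),
    head=AttrDict(type="CTCHead", out_channels=97),
)
model = RecModel(config)
print(model.name)  # RecModel_MobileNetV1Enhance_SequenceEncoder_CTCHead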
text_embedding_module/OCR/ocr_recog/RecMv1_enhance.py ADDED
@@ -0,0 +1,197 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .common import Activation


class ConvBNLayer(nn.Module):
    def __init__(
        self, num_channels, filter_size, num_filters, stride, padding, channels=None, num_groups=1, act="hard_swish"
    ):
        # `channels` is unused; kept for config compatibility with the Paddle original
        super(ConvBNLayer, self).__init__()
        self.act = act
        self._conv = nn.Conv2d(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            bias=False,
        )

        self._batch_norm = nn.BatchNorm2d(
            num_filters,
        )
        if self.act is not None:
            self._act = Activation(act_type=act, inplace=True)

    def forward(self, inputs):
        y = self._conv(inputs)
        y = self._batch_norm(y)
        if self.act is not None:
            y = self._act(y)
        return y


class DepthwiseSeparable(nn.Module):
    def __init__(
        self, num_channels, num_filters1, num_filters2, num_groups, stride, scale, dw_size=3, padding=1, use_se=False
    ):
        super(DepthwiseSeparable, self).__init__()
        self.use_se = use_se
        self._depthwise_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=int(num_filters1 * scale),
            filter_size=dw_size,
            stride=stride,
            padding=padding,
            num_groups=int(num_groups * scale),
        )
        if use_se:
            self._se = SEModule(int(num_filters1 * scale))
        self._pointwise_conv = ConvBNLayer(
            num_channels=int(num_filters1 * scale),
            filter_size=1,
            num_filters=int(num_filters2 * scale),
            stride=1,
            padding=0,
        )

    def forward(self, inputs):
        y = self._depthwise_conv(inputs)
        if self.use_se:
            y = self._se(y)
        y = self._pointwise_conv(y)
        return y


class MobileNetV1Enhance(nn.Module):
    def __init__(self, in_channels=3, scale=0.5, last_conv_stride=1, last_pool_type="max", **kwargs):
        super().__init__()
        self.scale = scale
        self.block_list = []

        self.conv1 = ConvBNLayer(
            num_channels=in_channels, filter_size=3, channels=3, num_filters=int(32 * scale), stride=2, padding=1
        )

        conv2_1 = DepthwiseSeparable(
            num_channels=int(32 * scale), num_filters1=32, num_filters2=64, num_groups=32, stride=1, scale=scale
        )
        self.block_list.append(conv2_1)

        conv2_2 = DepthwiseSeparable(
            num_channels=int(64 * scale), num_filters1=64, num_filters2=128, num_groups=64, stride=1, scale=scale
        )
        self.block_list.append(conv2_2)

        conv3_1 = DepthwiseSeparable(
            num_channels=int(128 * scale), num_filters1=128, num_filters2=128, num_groups=128, stride=1, scale=scale
        )
        self.block_list.append(conv3_1)

        conv3_2 = DepthwiseSeparable(
            num_channels=int(128 * scale),
            num_filters1=128,
            num_filters2=256,
            num_groups=128,
            stride=(2, 1),
            scale=scale,
        )
        self.block_list.append(conv3_2)

        conv4_1 = DepthwiseSeparable(
            num_channels=int(256 * scale), num_filters1=256, num_filters2=256, num_groups=256, stride=1, scale=scale
        )
        self.block_list.append(conv4_1)

        conv4_2 = DepthwiseSeparable(
            num_channels=int(256 * scale),
            num_filters1=256,
            num_filters2=512,
            num_groups=256,
            stride=(2, 1),
            scale=scale,
        )
        self.block_list.append(conv4_2)

        for _ in range(5):
            conv5 = DepthwiseSeparable(
                num_channels=int(512 * scale),
                num_filters1=512,
                num_filters2=512,
                num_groups=512,
                stride=1,
                dw_size=5,
                padding=2,
                scale=scale,
                use_se=False,
            )
            self.block_list.append(conv5)

        conv5_6 = DepthwiseSeparable(
            num_channels=int(512 * scale),
            num_filters1=512,
            num_filters2=1024,
            num_groups=512,
            stride=(2, 1),
            dw_size=5,
            padding=2,
            scale=scale,
            use_se=True,
        )
        self.block_list.append(conv5_6)

        conv6 = DepthwiseSeparable(
            num_channels=int(1024 * scale),
            num_filters1=1024,
            num_filters2=1024,
            num_groups=1024,
            stride=last_conv_stride,
            dw_size=5,
            padding=2,
            use_se=True,
            scale=scale,
        )
        self.block_list.append(conv6)

        self.block_list = nn.Sequential(*self.block_list)
        if last_pool_type == "avg":
            self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0)
        else:
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.out_channels = int(1024 * scale)

    def forward(self, inputs):
        y = self.conv1(inputs)
        y = self.block_list(y)
        y = self.pool(y)
        return y


def hardsigmoid(x):
    return F.relu6(x + 3.0, inplace=True) / 6.0


class SEModule(nn.Module):
    def __init__(self, channel, reduction=4):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1, padding=0, bias=True
        )
        self.conv2 = nn.Conv2d(
            in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1, padding=0, bias=True
        )

    def forward(self, inputs):
        outputs = self.avg_pool(inputs)
        outputs = self.conv1(outputs)
        outputs = F.relu(outputs)
        outputs = self.conv2(outputs)
        outputs = hardsigmoid(outputs)
        x = torch.mul(inputs, outputs)

        return x
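
A quick shape check under the defaults (scale=0.5; the (2, 1) strides halve only the height, so the horizontal resolution a CTC recognizer needs is preserved):

import torch

backbone = MobileNetV1Enhance(in_channels=3, scale=0.5)
img = torch.rand(1, 3, 32, 320)  # a typical recognition crop: height 32
feats = backbone(img)
print(backbone.out_channels)  # 512 (= 1024 * scale)
print(feats.shape)  # torch.Size([1, 512, 1, 80]) — height collapsed to 1, width kept at W/4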
text_embedding_module/OCR/ocr_recog/RecSVTR.py ADDED
@@ -0,0 +1,570 @@
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional
from torch.nn.init import ones_, trunc_normal_, zeros_


def drop_path(x, drop_prob=0.0, training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    The original name is misleading, as 'Drop Connect' is a different form of dropout from a separate paper.
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
    """
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = torch.tensor(1 - drop_prob, device=x.device)  # keep tensors on the input's device
    shape = (x.size()[0],) + (1,) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor = torch.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output


class Swish(nn.Module):
    def __init__(self):  # was misspelled `__int__`, so the constructor never actually ran
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)


class ConvBNLayer(nn.Module):
    def __init__(
        self, in_channels, out_channels, kernel_size=3, stride=1, padding=0, bias_attr=False, groups=1, act=nn.GELU
    ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            # weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()),
            bias=bias_attr,
        )
        self.norm = nn.BatchNorm2d(out_channels)
        self.act = act()

    def forward(self, inputs):
        out = self.conv(inputs)
        out = self.norm(out)
        out = self.act(out)
        return out


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        return input


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        if isinstance(act_layer, str):
            self.act = Swish()  # any string (e.g. "swish") selects Swish
        else:
            self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class ConvMixer(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        HW=(8, 25),
        local_k=(3, 3),
    ):
        super().__init__()
        self.HW = HW
        self.dim = dim
        self.local_mixer = nn.Conv2d(
            dim,
            dim,
            local_k,
            1,
            (local_k[0] // 2, local_k[1] // 2),
            groups=num_heads,
            # weight_attr=ParamAttr(initializer=KaimingNormal())
        )

    def forward(self, x):
        h = self.HW[0]
        w = self.HW[1]
        # Paddle-style transpose([0, 2, 1]) / reshape([0, ...]) ported to the PyTorch equivalents
        x = x.permute(0, 2, 1).reshape(-1, self.dim, h, w)
        x = self.local_mixer(x)
        x = x.flatten(2).permute(0, 2, 1)
        return x


class Attention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        mixer="Global",
        HW=(8, 25),
        local_k=(7, 11),
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.HW = HW
        if HW is not None:
            H = HW[0]
            W = HW[1]
            self.N = H * W
            self.C = dim
        if mixer == "Local" and HW is not None:
            hk = local_k[0]
            wk = local_k[1]
            mask = torch.ones([H * W, H + hk - 1, W + wk - 1])
            for h in range(0, H):
                for w in range(0, W):
                    mask[h * W + w, h : h + hk, w : w + wk] = 0.0
            mask_paddle = mask[:, hk // 2 : H + hk // 2, wk // 2 : W + wk // 2].flatten(1)
            mask_inf = torch.full([H * W, H * W], fill_value=float("-inf"))
            mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf)
            # plain CPU tensor; only consulted by the "Local" mixer
            self.mask = mask[None, None, :]
            # self.mask = mask.unsqueeze([0, 1])
        self.mixer = mixer

    def forward(self, x):
        if self.HW is not None:
            N = self.N
            C = self.C
        else:
            _, N, C = x.shape
        qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute((2, 0, 3, 1, 4))
        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]

        attn = q.matmul(k.permute((0, 1, 3, 2)))
        if self.mixer == "Local":
            attn += self.mask
        attn = functional.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        x = (attn.matmul(v)).permute((0, 2, 1, 3)).reshape((-1, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mixer="Global",
        local_mixer=(7, 11),
        HW=(8, 25),
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer="nn.LayerNorm",
        epsilon=1e-6,
        prenorm=True,
    ):
        super().__init__()
        if isinstance(norm_layer, str):
            self.norm1 = eval(norm_layer)(dim, eps=epsilon)
        else:
            self.norm1 = norm_layer(dim)
        if mixer == "Global" or mixer == "Local":
            self.mixer = Attention(
                dim,
                num_heads=num_heads,
                mixer=mixer,
                HW=HW,
                local_k=local_mixer,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop,
            )
        elif mixer == "Conv":
            self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer)
        else:
            raise TypeError("The mixer must be one of [Global, Local, Conv]")

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        if isinstance(norm_layer, str):
            self.norm2 = eval(norm_layer)(dim, eps=epsilon)
        else:
            self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp_ratio = mlp_ratio
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        self.prenorm = prenorm

    def forward(self, x):
        if self.prenorm:
            x = self.norm1(x + self.drop_path(self.mixer(x)))
            x = self.norm2(x + self.drop_path(self.mlp(x)))
        else:
            x = x + self.drop_path(self.mixer(self.norm1(x)))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """Image to Patch Embedding"""

    def __init__(self, img_size=(32, 100), in_channels=3, embed_dim=768, sub_num=2):
        super().__init__()
        num_patches = (img_size[1] // (2**sub_num)) * (img_size[0] // (2**sub_num))
        self.img_size = img_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.norm = None
        if sub_num == 2:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=False,
                ),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=False,
                ),
            )
        if sub_num == 3:
            self.proj = nn.Sequential(
                ConvBNLayer(
                    in_channels=in_channels,
                    out_channels=embed_dim // 4,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=False,
                ),
                ConvBNLayer(
                    in_channels=embed_dim // 4,
                    out_channels=embed_dim // 2,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=False,
                ),
                ConvBNLayer(
                    in_channels=embed_dim // 2,
                    out_channels=embed_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    act=nn.GELU,
                    bias_attr=False,
                ),
            )

    def forward(self, x):
        B, C, H, W = x.shape
        assert (
            H == self.img_size[0] and W == self.img_size[1]
        ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).permute(0, 2, 1)
        return x


class SubSample(nn.Module):
    def __init__(self, in_channels, out_channels, types="Pool", stride=(2, 1), sub_norm="nn.LayerNorm", act=None):
        super().__init__()
        self.types = types
        if types == "Pool":
            self.avgpool = nn.AvgPool2d(kernel_size=(3, 5), stride=stride, padding=(1, 2))
            self.maxpool = nn.MaxPool2d(kernel_size=(3, 5), stride=stride, padding=(1, 2))
            self.proj = nn.Linear(in_channels, out_channels)
        else:
            self.conv = nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=1,
                # weight_attr=ParamAttr(initializer=KaimingNormal())
            )
        self.norm = eval(sub_norm)(out_channels)
        if act is not None:
            self.act = act()
        else:
            self.act = None

    def forward(self, x):
        if self.types == "Pool":
            x1 = self.avgpool(x)
            x2 = self.maxpool(x)
            x = (x1 + x2) * 0.5
            out = self.proj(x.flatten(2).permute((0, 2, 1)))
        else:
            x = self.conv(x)
            out = x.flatten(2).permute((0, 2, 1))
        out = self.norm(out)
        if self.act is not None:
            out = self.act(out)

        return out


class SVTRNet(nn.Module):
    def __init__(
        self,
        img_size=[48, 100],
        in_channels=3,
        embed_dim=[64, 128, 256],
        depth=[3, 6, 3],
        num_heads=[2, 4, 8],
        mixer=["Local"] * 6 + ["Global"] * 6,  # Local atten, Global atten, Conv
        local_mixer=[[7, 11], [7, 11], [7, 11]],
        patch_merging="Conv",  # Conv, Pool, None
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        last_drop=0.1,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_layer="nn.LayerNorm",
        sub_norm="nn.LayerNorm",
        epsilon=1e-6,
        out_channels=192,
        out_char_num=25,
        block_unit="Block",
        act="nn.GELU",
        last_stage=True,
        sub_num=2,
        prenorm=True,
        use_lenhead=False,
        **kwargs,
    ):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.out_channels = out_channels
        self.prenorm = prenorm
        patch_merging = None if patch_merging != "Conv" and patch_merging != "Pool" else patch_merging
        self.patch_embed = PatchEmbed(
            img_size=img_size, in_channels=in_channels, embed_dim=embed_dim[0], sub_num=sub_num
        )
        num_patches = self.patch_embed.num_patches
        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0]))
        # self.pos_embed = self.create_parameter(
        #     shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)

        # self.add_parameter("pos_embed", self.pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)
        Block_unit = eval(block_unit)

        dpr = np.linspace(0, drop_path_rate, sum(depth))
        self.blocks1 = nn.ModuleList(
            [
                Block_unit(
                    dim=embed_dim[0],
                    num_heads=num_heads[0],
                    mixer=mixer[0 : depth[0]][i],
                    HW=self.HW,
                    local_mixer=local_mixer[0],
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer=eval(act),
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[0 : depth[0]][i],
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                    prenorm=prenorm,
                )
                for i in range(depth[0])
            ]
        )
        if patch_merging is not None:
            self.sub_sample1 = SubSample(
                embed_dim[0], embed_dim[1], sub_norm=sub_norm, stride=[2, 1], types=patch_merging
            )
            HW = [self.HW[0] // 2, self.HW[1]]
        else:
            HW = self.HW
        self.patch_merging = patch_merging
        self.blocks2 = nn.ModuleList(
            [
                Block_unit(
                    dim=embed_dim[1],
                    num_heads=num_heads[1],
                    mixer=mixer[depth[0] : depth[0] + depth[1]][i],
                    HW=HW,
                    local_mixer=local_mixer[1],
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer=eval(act),
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[depth[0] : depth[0] + depth[1]][i],
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                    prenorm=prenorm,
                )
                for i in range(depth[1])
            ]
        )
        if patch_merging is not None:
            self.sub_sample2 = SubSample(
                embed_dim[1], embed_dim[2], sub_norm=sub_norm, stride=[2, 1], types=patch_merging
            )
            HW = [self.HW[0] // 4, self.HW[1]]
        else:
            HW = self.HW
        self.blocks3 = nn.ModuleList(
            [
                Block_unit(
                    dim=embed_dim[2],
                    num_heads=num_heads[2],
                    mixer=mixer[depth[0] + depth[1] :][i],
                    HW=HW,
                    local_mixer=local_mixer[2],
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer=eval(act),
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[depth[0] + depth[1] :][i],
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                    prenorm=prenorm,
                )
                for i in range(depth[2])
            ]
        )
        self.last_stage = last_stage
        if last_stage:
            self.avg_pool = nn.AdaptiveAvgPool2d((1, out_char_num))
            self.last_conv = nn.Conv2d(
                in_channels=embed_dim[2],
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )
            self.hardswish = nn.Hardswish()
            self.dropout = nn.Dropout(p=last_drop)
        if not prenorm:
            self.norm = eval(norm_layer)(embed_dim[-1], eps=epsilon)  # nn.LayerNorm takes `eps`, not `epsilon`
        self.use_lenhead = use_lenhead
        if use_lenhead:
            self.len_conv = nn.Linear(embed_dim[2], self.out_channels)
            self.hardswish_len = nn.Hardswish()
            self.dropout_len = nn.Dropout(p=last_drop)

        trunc_normal_(self.pos_embed, std=0.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        for blk in self.blocks1:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample1(x.permute([0, 2, 1]).reshape([-1, self.embed_dim[0], self.HW[0], self.HW[1]]))
        for blk in self.blocks2:
            x = blk(x)
        if self.patch_merging is not None:
            x = self.sub_sample2(x.permute([0, 2, 1]).reshape([-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]]))
        for blk in self.blocks3:
            x = blk(x)
        if not self.prenorm:
            x = self.norm(x)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        if self.use_lenhead:
            len_x = self.len_conv(x.mean(1))
            len_x = self.dropout_len(self.hardswish_len(len_x))
        if self.last_stage:
            if self.patch_merging is not None:
                h = self.HW[0] // 4
            else:
                h = self.HW[0]
            x = self.avg_pool(x.permute([0, 2, 1]).reshape([-1, self.embed_dim[2], h, self.HW[1]]))
            x = self.last_conv(x)
            x = self.hardswish(x)
            x = self.dropout(x)
        if self.use_lenhead:
            return x, len_x
        return x


if __name__ == "__main__":
    a = torch.rand(1, 3, 48, 100)
    svtr = SVTRNet()

    out = svtr(a)
    print(svtr)
    print(out.size())
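
A small sketch of the stochastic-depth behavior `drop_path` implements: at evaluation it is the identity, while in training each sample's residual branch is either zeroed or rescaled by 1/keep_prob so the expectation is unchanged:

import torch

x = torch.ones(4, 3)
print(drop_path(x, drop_prob=0.5, training=False))  # identity: returned unchanged
out = drop_path(x, drop_prob=0.5, training=True)
print(out)  # each row is all zeros or all 2.0 (= 1 / keep_prob), drawn independently per sample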
text_embedding_module/OCR/ocr_recog/common.py ADDED
@@ -0,0 +1,74 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class Hswish(nn.Module):
    def __init__(self, inplace=True):
        super(Hswish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        return x * F.relu6(x + 3.0, inplace=self.inplace) / 6.0


# out = max(0, min(1, slope*x+offset))
# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None)
class Hsigmoid(nn.Module):
    def __init__(self, inplace=True):
        super(Hsigmoid, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        # torch: F.relu6(x + 3., inplace=self.inplace) / 6.
        # paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6.
        return F.relu6(1.2 * x + 3.0, inplace=self.inplace) / 6.0


class GELU(nn.Module):
    def __init__(self, inplace=True):
        # `inplace` is accepted for interface uniformity but has no effect here
        super(GELU, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        return torch.nn.functional.gelu(x)


class Swish(nn.Module):
    def __init__(self, inplace=True):
        super(Swish, self).__init__()
        self.inplace = inplace

    def forward(self, x):
        if self.inplace:
            # the in-place variant mutates x; avoid it where the input is needed for autograd
            x.mul_(torch.sigmoid(x))
            return x
        else:
            return x * torch.sigmoid(x)


class Activation(nn.Module):
    def __init__(self, act_type, inplace=True):
        super(Activation, self).__init__()
        act_type = act_type.lower()
        if act_type == "relu":
            self.act = nn.ReLU(inplace=inplace)
        elif act_type == "relu6":
            self.act = nn.ReLU6(inplace=inplace)
        elif act_type == "sigmoid":
            raise NotImplementedError
        elif act_type == "hard_sigmoid":
            self.act = Hsigmoid(inplace)
        elif act_type == "hard_swish":
            self.act = Hswish(inplace=inplace)
        elif act_type == "leakyrelu":
            self.act = nn.LeakyReLU(inplace=inplace)
        elif act_type == "gelu":
            self.act = GELU(inplace=inplace)
        elif act_type == "swish":
            self.act = Swish(inplace=inplace)
        else:
            raise NotImplementedError

    def forward(self, inputs):
        return self.act(inputs)
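
A one-line sanity check of the `Activation` factory ("hard_swish" is what the MobileNet backbone above requests by default):

import torch

act = Activation("hard_swish", inplace=False)
t = torch.tensor([-3.0, 0.0, 3.0])
print(act(t))  # x * relu6(x + 3) / 6 → approximately tensor([0., 0., 3.])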
text_embedding_module/OCR/ocr_recog/en_dict.txt ADDED
@@ -0,0 +1,95 @@
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
 