/td>
+ _0 = "[零]"
|
|
86
|
+ _1_9 = "[一二三四五六七八九]"
|
|
87
|
+ _10_99 = f"{_1_9}?[十]{_1_9}?"
|
|
88
|
+ _1_99 = f"({_10_99}|{_1_9})"
|
|
89
|
+ _100_999 = f"({_1_9}[百]([零]{_1_9})?|{_1_9}[百]{_10_99})"
|
|
90
|
+ _1_999 = f"({_100_999}|{_1_99})"
|
|
91
|
+ _1000_9999 = f"({_1_9}[千]([零]{_1_99})?|{_1_9}[千]{_100_999})"
|
|
92
|
+ _1_9999 = f"({_1000_9999}|{_1_999})"
|
|
93
|
+ _10000_99999999 = f"({_1_9999}[万]([零]{_1_999})?|{_1_9999}[万]{_1000_9999})"
|
|
94
|
+ _1_99999999 = f"({_10000_99999999}|{_1_9999})"
|
|
95
|
+ _100000000_9999999999999999 = f"({_1_99999999}[亿]([零]{_1_99999999})?|{_1_99999999}[亿]{_10000_99999999})"
|
|
96
|
+ _1_9999999999999999 = f"({_100000000_9999999999999999}|{_1_99999999})"
|
|
97
|
+ str_int_pattern = f"^({_0}|{_1_9999999999999999})$"
|
|
98
|
+ nor_int_pattern = f"^({_0}|{_1_9999999999999999})$"
|
|
99
|
+
|
|
100
|
+ str_dec_pattern = "^[零一二三四五六七八九]{0,15}[一二三四五六七八九]$"
|
|
101
|
+ nor_dec_pattern = "^[零一二三四五六七八九]{0,16}$"
|
|
102
|
+
|
|
103
|
+ for str_num in self.strict_cn_number.keys():
|
|
104
|
+ str_int_pattern = str_int_pattern.replace(str_num, self.strict_cn_number[str_num])
|
|
105
|
+ str_dec_pattern = str_dec_pattern.replace(str_num, self.strict_cn_number[str_num])
|
|
106
|
+ for nor_num in self.normal_cn_number.keys():
|
|
107
|
+ nor_int_pattern = nor_int_pattern.replace(nor_num, self.normal_cn_number[nor_num])
|
|
108
|
+ nor_dec_pattern = nor_dec_pattern.replace(nor_num, self.normal_cn_number[nor_num])
|
|
109
|
+
|
|
110
|
+ pattern_dict = {
|
|
111
|
+ "strict": {
|
|
112
|
+ "int": re.compile(str_int_pattern),
|
|
113
|
+ "dec": re.compile(str_dec_pattern)
|
|
114
|
+ },
|
|
115
|
+ "normal": {
|
|
116
|
+ "int": re.compile(nor_int_pattern),
|
|
117
|
+ "dec": re.compile(nor_dec_pattern)
|
|
118
|
+ }
|
|
119
|
+ }
|
|
120
|
+ return pattern_dict
|
|
121
|
+
|
|
122
|
def __copy_num(self, num):
    """Transliterate each ASCII digit of ``num`` into a Chinese digit.

    Digits are mapped one by one through ``NUMBER_LOW_AN2CN``; no unit
    characters (十, 百, ...) are inserted, so "123" becomes the three
    characters for 1, 2 and 3.

    :param num: iterable of single characters, each accepted by ``int()``.
    :return: the concatenated Chinese digits.
    """
    return "".join(NUMBER_LOW_AN2CN[int(digit)] for digit in num)
|
|
127
|
+
|
|
128
|
def __check_input_data_is_valid(self, check_data: str, mode: str) -> (int, str, str, bool):
    """Validate ``check_data`` for ``mode`` and split it into convertible parts.

    Steps, in order:

    1. Strip a trailing currency suffix (元整/圆整/元正/圆正, and outside
       strict mode also a bare 圆/元).
    2. If ``self.yjf_pattern`` matches, rewrite a 元/角/分 amount into the
       decimal "点" form.
    3. Expand the shorthand 零十 / 零百 to 零一十 / 零一百.
    4. Reject any character that is not in ``self.check_key_dict[mode]``.
    5. Strip a leading 负 and remember the sign, then split on 点.
    6. In smart mode, turn Arabic digits into Chinese numerals and continue
       as normal mode.
    7. Match the integer and decimal parts against ``self.pattern_dict``;
       in normal mode also try the all-digits and colloquial forms.

    :param check_data: the text to validate.
    :param mode: "strict", "normal" or "smart".
    :return: ``(sign, integer_data, decimal_data, is_all_num)`` where ``sign``
        is 1 or -1, ``decimal_data`` is ``None`` when there is no 点, and
        ``is_all_num`` is True when the integer part is a plain digit sequence.
        For smart-mode input fully matched by ``self.pattern1`` the result is
        ``(0, number, None, None)`` with ``number`` already converted.
    :raises ValueError: if the input is empty, contains a character outside
        the mode's range, has more than one 点, or does not match any
        accepted format.
    """

    def decimal_is_valid() -> bool:
        # No decimal part at all counts as valid; otherwise the whole
        # decimal text must be matched by the mode's decimal pattern.
        if decimal_data is None:
            return True
        result_dec = self.pattern_dict[mode]["dec"].search(decimal_data)
        return bool(result_dec) and result_dec.group() == decimal_data

    # Remove a trailing 元整 / 圆整 / 元正 / 圆正.
    for word in ("元整", "圆整", "元正", "圆正"):
        if check_data.endswith(word):
            check_data = check_data[:-2]

    # Outside strict mode a bare trailing 圆 / 元 is removed as well.
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if mode != "strict":
        for word in ("圆", "元"):
            if check_data.endswith(word):
                check_data = check_data[:-1]

    # Handle 元/角/分 amounts: 元 becomes the decimal separator.
    if self.yjf_pattern.search(check_data):
        check_data = check_data.replace("元", "点").replace("角", "").replace("分", "")

    # Special spellings such as 一千零十一 / 一万零百一十一.
    check_data = check_data.replace("零十", "零一十").replace("零百", "零一百")

    for data in check_data:
        if data not in self.check_key_dict[mode]:
            raise ValueError(f"当前为{mode}模式，输入的数据不在转化范围内：{data}！")

    # Nothing left to convert (empty input, or only a currency suffix).
    # Previously this surfaced as an IndexError from check_data[0] / [-1].
    if not check_data:
        raise ValueError("输入数据为空，无法转化！")

    # Determine the sign.
    if check_data.startswith("负"):
        check_data = check_data[1:]
        sign = -1
    else:
        sign = 1

    if "点" in check_data:
        split_data = check_data.split("点")
        if len(split_data) != 2:
            raise ValueError("数据中包含不止一个点！")
        integer_data, decimal_data = split_data
        # Smart mode: convert Arabic digits to Chinese numerals. The integer
        # part is read as a number, the decimal part digit by digit.
        if mode == "smart":
            integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
            decimal_data = re.sub(r"\d+", lambda x: self.__copy_num(x.group()), decimal_data)
            mode = "normal"
    else:
        integer_data = check_data
        decimal_data = None
        if mode == "smart":
            # Inputs such as "10.1万" or "10.1" are converted right here.
            result1 = self.pattern1.search(integer_data)
            if result1 and result1.group() == integer_data:
                last_char = integer_data[-1:]
                if last_char in UNIT_CN2AN:
                    output = int(float(integer_data[:-1]) * UNIT_CN2AN[last_char])
                else:
                    output = float(integer_data)
                # A sign of 0 marks "already converted", so a leading 负 that
                # was stripped above has to be applied to the value itself.
                if sign == -1:
                    output = -output
                return 0, output, None, None

            integer_data = re.sub(r"\d+", lambda x: self.ac.an2cn(x.group()), integer_data)
            mode = "normal"

    result_int = self.pattern_dict[mode]["int"].search(integer_data)
    if result_int:
        if result_int.group() == integer_data and decimal_is_valid():
            return sign, integer_data, decimal_data, False
    elif mode == "strict":
        raise ValueError(f"不符合格式的数据：{integer_data}")
    elif mode == "normal":
        # Plain digit sequence, e.g. 一二三.
        result_all_num = self.ptn_all_num.search(integer_data)
        if result_all_num and result_all_num.group() == integer_data and decimal_is_valid():
            return sign, integer_data, decimal_data, True

        # Colloquial form, e.g. 一万二, 两千三, 三百四, 十三万六, 一百二十五万三.
        # The shortest text this form can take is three characters long.
        result_speaking_mode = self.ptn_speaking_mode.search(integer_data)
        if len(integer_data) >= 3 and result_speaking_mode and result_speaking_mode.group() == integer_data:
            # Append the unit one decimal step below the last spoken unit,
            # turning e.g. 一万二 into 一万二千.
            last_unit = result_speaking_mode.groups()[-1][-1]
            _unit = UNIT_LOW_AN2CN[UNIT_CN2AN[last_unit] // 10]
            integer_data = integer_data + _unit
            if decimal_is_valid():
                return sign, integer_data, decimal_data, False

    raise ValueError(f"不符合格式的数据：{check_data}")
|
|
236
|
+
|
|
237
|
def __integer_convert(self, integer_data: str) -> int:
    """Convert a Chinese integer written with digits and units to an int.

    The text is scanned from its last character to its first. A digit is
    added to the total scaled by the unit currently in effect; a unit
    character updates that scale.

    :param integer_data: Chinese numeral text without sign or decimal part.
    :return: the integer value.
    :raises ValueError: if a character is neither a known digit nor a unit.
    """
    total = 0
    multiplier = 1
    # Largest 万/亿 level seen so far; smaller units to its left are scaled by it.
    section_unit = 1
    final_index = len(integer_data) - 1

    for position, char in enumerate(reversed(integer_data)):
        # Digit: weight it by the unit currently in effect.
        if char in NUMBER_CN2AN:
            total += NUMBER_CN2AN[char] * multiplier
            continue

        if char not in UNIT_CN2AN:
            raise ValueError(f"{char} 不在转化范围内")

        # Unit character.
        multiplier = UNIT_CN2AN[char]
        if multiplier % 10000 == 0:
            if multiplier > section_unit:
                # A new, larger level: 万 or 亿.
                section_unit = multiplier
            else:
                # A level not larger than the current one compounds with it (万亿).
                section_unit = multiplier * section_unit
                multiplier = section_unit

        if multiplier < section_unit:
            multiplier = multiplier * section_unit

        # A unit that opens the text has no digit in front of it (e.g. 十一),
        # so it contributes one of itself.
        if position == final_index:
            total += multiplier

    return int(total)
|
|
269
|
+
|
|
270
|
def __decimal_convert(self, decimal_data: str) -> float:
    """Convert the Chinese digits after the decimal point to a fraction.

    At most 16 digits are used; longer input is truncated and a warning is
    issued.

    :param decimal_data: digit characters that follow 点.
    :return: the fractional value, rounded to the number of digits used.
    """
    precision = len(decimal_data)

    if precision > 16:
        warn(f"注意：小数部分长度为 {precision} ，将自动截取前 16 位有效精度！")
        decimal_data = decimal_data[:16]
        precision = 16

    fraction = 0
    # Accumulate from the last digit towards the first so the floating-point
    # additions happen in the same order as before.
    for offset in reversed(range(len(decimal_data))):
        digit = NUMBER_CN2AN[decimal_data[offset]]
        fraction += digit * 10 ** -(offset + 1)

    # Round away the floating-point noise picked up while summing.
    return round(fraction, precision)
|
|
287
|
+
|
|
288
|
def __direct_convert(self, data: str) -> int:
    """Read ``data`` as a plain run of Chinese digits (no unit characters).

    Each character is looked up in ``NUMBER_CN2AN`` and weighted by its
    decimal position, the last character being the ones place.

    :param data: digit characters only, e.g. 一二三.
    :return: the integer the digits spell out; 0 for empty input.
    """
    return sum(
        NUMBER_CN2AN[char] * 10 ** power
        for power, char in enumerate(reversed(data))
    )
|
|
|
@@ -0,0 +1,215 @@
|
|
1
|
+import unittest
|
|
2
|
+
|
|
3
|
+from .cn2an import Cn2An
|
|
4
|
+
|
|
5
|
+
|
|
6
|
class Cn2anTest(unittest.TestCase):
    """Conversion and rejection cases for ``Cn2An.cn2an`` in every mode."""

    def setUp(self) -> None:
        # Inputs that must convert in strict mode (and so in every mode).
        self.strict_data_dict = {
            "零": 0,
            "一": 1,
            "十": 10,
            "十一": 11,
            "一十一": 11,
            "二十": 20,
            "二十一": 21,
            "一百": 100,
            "一百零一": 101,
            "一百一十": 110,
            "一百一十一": 111,
            "一千": 1000,
            "一千一百": 1100,
            "一千一百一十": 1110,
            "一千一百一十一": 1111,
            "一千零一十": 1010,
            "一千零十": 1010,
            "一千零十一": 1011,
            "一千零一十一": 1011,
            "一千零一": 1001,
            "一千一百零一": 1101,
            "一万一千一百一十一": 11111,
            "一十一万一千一百一十一": 111111,
            "一百一十一万一千一百一十一": 1111111,
            "一千一百一十一万一千一百一十一": 11111111,
            "一亿一千一百一十一万一千一百一十一": 111111111,
            "一十一亿一千一百一十一万一千一百一十一": 1111111111,
            "一百一十一亿一千一百一十一万一千一百一十一": 11111111111,
            "一千一百一十一亿一千一百一十一万一千一百一十一": 111111111111,
            "一千一百一十一万一千一百一十一亿一千一百一十一万一千一百一十一": 1111111111111111,
            "壹": 1,
            "拾": 10,
            "拾壹": 11,
            "壹拾壹": 11,
            "壹佰壹拾壹": 111,
            "壹仟壹佰壹拾壹": 1111,
            "壹万壹仟壹佰壹拾壹": 11111,
            "壹拾壹万壹仟壹佰壹拾壹": 111111,
            "壹佰壹拾壹万壹仟壹佰壹拾壹": 1111111,
            "壹仟壹佰壹拾壹万壹仟壹佰壹拾壹": 11111111,
            "壹亿壹仟壹佰壹拾壹万壹仟壹佰壹拾壹": 111111111,
            "壹拾壹亿壹仟壹佰壹拾壹万壹仟壹佰壹拾壹": 1111111111,
            "壹佰壹拾壹亿壹仟壹佰壹拾壹万壹仟壹佰壹拾壹": 11111111111,
            "壹仟壹佰壹拾壹亿壹仟壹佰壹拾壹万壹仟壹佰壹拾壹": 111111111111,
            "壹拾壹元整": 11,
            "壹佰壹拾壹圆整": 111,
            "壹拾壹元正": 11,
            "壹拾壹圆正": 11,
            "壹拾壹元壹角": 11.1,
            "壹拾壹元壹角壹分": 11.11,
            "十万": 100000,
            "十万零一": 100001,
            "一万零一": 10001,
            "一万零一十一": 10011,
            "一万零一百一十一": 10111,
            "一万零百一十一": 10111,
            "一十万零一": 100001,
            "一百万零一": 1000001,
            "一千万零一": 10000001,
            "一千零一万一千零一": 10011001,
            "一千零一万零一": 10010001,
            "一亿零一": 100000001,
            "一十亿零一": 1000000001,
            "一百亿零一": 10000000001,
            "一千零一亿一千零一万一千零一": 100110011001,
            "一千亿一千万一千零一": 100010001001,
            "一千亿零一": 100000000001,
            "零点零零零零零零零零零零零零零零一": 0.000000000000001,
            "零点零零零零零零零零零零零零零一": 0.00000000000001,
            "零点零零零零零零零零零零零零一": 0.0000000000001,
            "零点零零零零零零零零零零零一": 0.000000000001,
            "零点零零零零零零零零零零一": 0.00000000001,
            "零点零零零零零零零零零一": 0.0000000001,
            "零点零零零零零零零零一": 0.000000001,
            "零点零零零零零零零一": 0.00000001,
            "零点零零零零零零一": 0.0000001,
            "零点零零零零零一": 0.000001,
            "零点零零零零一": 0.00001,
            "零点零零零一": 0.0001,
            "零点零零一": 0.001,
            "零点零一": 0.01,
            "零点一": 0.1,
            "负一": -1,
            "负二": -2,
            "负十": -10,
            "负十一": -11,
            "负一十一": -11,
            # Archaic form
            "廿二": 22,
        }

        # Inputs that additionally convert in normal mode.
        self.normal_data_dict = {
            "一一": 11,
            "一一一": 111,
            "壹壹": 11,
            "壹壹壹": 111,
            "零点零": 0,
            "零点零零": 0,
            "一七二零": 1720,
            "一七二零点一": 1720.1,
            "一七二零点一三四": 1720.134,
            "一二三": 123,
            "负零点一零": -0.1,
            "负一七二零": -1720,
            "负一七二零点一": -1720.1,
            # Colloquial
            "三万五": 35000,
            "十三万五": 135000,
            "两千六": 2600,
            "一百二": 120,
            "一百二十万三": 1203000,
            # Traditional characters
            "兩千六": 2600,
            # Upper-case (financial) numerals
            "壹拾壹元": 11,
            "壹佰壹拾壹圆": 111,
            "壹拾壹圆": 11,
            # Special
            "〇": 0,
        }

        # Inputs that additionally convert in smart mode.
        self.smart_data_dict = {
            "100万": 1000000,
            "100万三千": 1003000,
            "200亿零四千230": 20000004230,
            "一百点123": 100.123,
            "10.1万": 101000,
            "-10.1万": -101000,
            "35.1亿": 3510000000,
            "10.1": 10.1,
            "-10.1": -10.1,
        }

        self.error_smart_datas = [
            "10.1万零100",
            "10..1万",
        ]

        # Each stricter mode must also reject everything that only a looser
        # mode accepts, so the looser mode's valid inputs are appended here
        # BEFORE the valid-input dicts are merged below.
        self.error_normal_datas = [
            "零点",
            "点零",
            "零点点",
            "零点零大",
        ]
        self.error_normal_datas.extend(self.error_smart_datas)
        self.error_normal_datas.extend(list(self.smart_data_dict.keys()))

        self.error_strict_datas = [
            "一一",
            "壹壹",
            "零点",
            "点零",
            "点一",
            "百十一",
            "十一十二",
            "负十一十二",
            "十七十八",
        ]
        self.error_strict_datas.extend(self.error_normal_datas)
        self.error_strict_datas.extend(list(self.normal_data_dict.keys()))

        # Do not move: the merges must come after the error lists are built,
        # otherwise valid inputs would leak into the rejection lists.
        self.normal_data_dict.update(self.strict_data_dict)
        self.smart_data_dict.update(self.normal_data_dict)

        self.ca = Cn2An()

    def test_cn2an(self) -> None:
        """Check every valid input per mode, then every input that must be rejected."""
        valid_cases = (
            ("strict", self.strict_data_dict),
            ("normal", self.normal_data_dict),
            ("smart", self.smart_data_dict),
        )
        for mode, cases in valid_cases:
            for text, expected in cases.items():
                with self.subTest(mode=mode, text=text):
                    self.assertEqual(self.ca.cn2an(text, mode), expected)

        # NOTE(review): none of the rejection checks passes a mode, so all
        # three lists are exercised with cn2an's default mode only (presumably
        # "strict" - confirm). Passing "normal" / "smart" here looks like the
        # intent, but whether every entry is rejected in those modes needs to
        # be verified against Cn2An before changing it.
        rejected_cases = (
            ("strict", self.error_strict_datas),
            ("normal", self.error_normal_datas),
            ("smart", self.error_smart_datas),
        )
        for label, texts in rejected_cases:
            for text in texts:
                with self.subTest(error_list=label, text=text):
                    with self.assertRaises(ValueError):
                        self.ca.cn2an(text)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
if __name__ == "__main__":
    # Run the tests when this module is executed directly.
    unittest.main()
|
|
|
@@ -0,0 +1,135 @@
|
|
1
|
# Chinese digit character -> integer value. Covers the everyday forms, the
# upper-case (financial) forms and the colloquial variants; every digit
# variant listed in NORMAL_CN_NUMBER below needs an entry here, otherwise
# the conversion lookup fails for text the normal-mode table admits.
NUMBER_CN2AN = {
    "零": 0,
    "〇": 0,
    "一": 1,
    "壹": 1,
    "幺": 1,
    "二": 2,
    "贰": 2,
    "两": 2,
    "三": 3,
    "叁": 3,
    "仨": 3,
    "四": 4,
    "肆": 4,
    "五": 5,
    "伍": 5,
    "六": 6,
    "陆": 6,
    "七": 7,
    "柒": 7,
    "八": 8,
    "捌": 8,
    "九": 9,
    "玖": 9,
}

# Chinese unit character -> multiplier (everyday and upper-case forms).
UNIT_CN2AN = {
    "十": 10,
    "拾": 10,
    "百": 100,
    "佰": 100,
    "千": 1000,
    "仟": 1000,
    "万": 10000,
    "亿": 100000000,
}

# Multiplier -> everyday unit character.
UNIT_LOW_AN2CN = {
    10: "十",
    100: "百",
    1000: "千",
    10000: "万",
    100000000: "亿",
}

# Digit value -> everyday Chinese digit.
NUMBER_LOW_AN2CN = {
    0: "零",
    1: "一",
    2: "二",
    3: "三",
    4: "四",
    5: "五",
    6: "六",
    7: "七",
    8: "八",
    9: "九",
}

# Digit value -> upper-case (financial) Chinese digit.
NUMBER_UP_AN2CN = {
    0: "零",
    1: "壹",
    2: "贰",
    3: "叁",
    4: "肆",
    5: "伍",
    6: "陆",
    7: "柒",
    8: "捌",
    9: "玖",
}

# Everyday unit for each of 16 positions; presumably indexed by decimal
# place with index 0 being the ones place - confirm against the An2Cn user.
UNIT_LOW_ORDER_AN2CN = [
    "",
    "十",
    "百",
    "千",
    "万",
    "十",
    "百",
    "千",
    "亿",
    "十",
    "百",
    "千",
    "万",
    "十",
    "百",
    "千",
]

# Upper-case counterpart of UNIT_LOW_ORDER_AN2CN.
UNIT_UP_ORDER_AN2CN = [
    "",
    "拾",
    "佰",
    "仟",
    "万",
    "拾",
    "佰",
    "仟",
    "亿",
    "拾",
    "佰",
    "仟",
    "万",
    "拾",
    "佰",
    "仟",
]

# Canonical character -> every spelling accepted for it in strict mode.
STRICT_CN_NUMBER = {
    "零": "零",
    "一": "一壹",
    "二": "二贰",
    "三": "三叁",
    "四": "四肆",
    "五": "五伍",
    "六": "六陆",
    "七": "七柒",
    "八": "八捌",
    "九": "九玖",
    "十": "十拾",
    "百": "百佰",
    "千": "千仟",
    "万": "万",
    "亿": "亿",
}

# Canonical character -> every spelling accepted for it in normal mode
# (strict spellings plus colloquial variants).
NORMAL_CN_NUMBER = {
    "零": "零〇",
    "一": "一壹幺",
    "二": "二贰两",
    "三": "三叁仨",
    "四": "四肆",
    "五": "五伍",
    "六": "六陆",
    "七": "七柒",
    "八": "八捌",
    "九": "九玖",
    "十": "十拾",
    "百": "百佰",
    "千": "千仟",
    "万": "万",
    "亿": "亿",
}
|
|
|
@@ -0,0 +1,29 @@
|
|
1
|
+import torbjorn as tbn
|
|
2
|
+
|
|
3
|
+from .an2cn import An2Cn
|
|
4
|
+from .cn2an import Cn2An
|
|
5
|
+
|
|
6
|
# Converter instances shared by both benchmarks, plus a 16-digit sample
# number and its Chinese spelling used as input and expected output.
ac, ca = An2Cn(), Cn2An()

an = 9876543298765432
cn = "九千八百七十六万五千四百三十二亿九千八百七十六万五千四百三十二"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
@tbn.run_time
def run_cn2an_ten_thousand_times() -> None:
    """Convert the sample Chinese numeral to a number 10000 times, checking each result."""
    iterations = 10000
    for _ in range(iterations):
        converted = ca.cn2an(cn)
        assert converted == an
|
|
18
|
+
|
|
19
|
+
|
|
20
|
@tbn.run_time
def run_an2cn_ten_thousand_times() -> None:
    """Convert the sample number to a Chinese numeral 10000 times, checking each result."""
    iterations = 10000
    for _ in range(iterations):
        converted = ac.an2cn(an)
        assert converted == cn
|
|
25
|
+
|
|
26
|
+
|
|
27
|
if __name__ == "__main__":
    # Run both benchmarks, Chinese-to-Arabic first, when executed as a script.
    for benchmark in (run_cn2an_ten_thousand_times, run_an2cn_ten_thousand_times):
        benchmark()
|
|
|
@@ -0,0 +1,104 @@
|
|
1
|
+import re
|
|
2
|
+from warnings import warn
|
|
3
|
+
|
|
4
|
+from .cn2an import Cn2An
|
|
5
|
+from .an2cn import An2Cn
|
|
6
|
+from .conf import UNIT_CN2AN
|
|
7
|
+
|
|
8
|
+
|
|
9
|
class Transform(object):
    """Rewrite numbers embedded in a sentence between Chinese and Arabic notation.

    ``transform`` looks, in this order, for dates, fractions, percentages,
    Celsius temperatures and finally plain numbers, and converts each span it
    finds through ``Cn2An.cn2an`` / ``An2Cn.an2cn``. A span that cannot be
    converted is left as it was and reported through ``warnings.warn``.
    """

    def __init__(self) -> None:
        # Characters that may appear inside a Chinese number.
        self.all_num = "零一二三四五六七八九"
        self.all_unit = "".join(UNIT_CN2AN)
        self.cn2an = Cn2An().cn2an
        self.an2cn = An2Cn().an2cn
        cn_chars = f"[{self.all_num}{self.all_unit}]"
        # Optional 负, optional integer part ending in 点, then digits/units.
        self.cn_pattern = f"负?({cn_chars}+点)?{cn_chars}+"
        # Arabic digits followed by Chinese units, e.g. "2.5亿". The decimal
        # point is escaped so that it no longer matches arbitrary characters.
        self.smart_cn_pattern = fr"-?([0-9]+\.)?[0-9]+[{self.all_unit}]+"

    def transform(self, inputs: str, method: str = "cn2an") -> str:
        """Convert every number found in ``inputs``.

        :param inputs: the sentence to rewrite.
        :param method: "cn2an" (Chinese to Arabic) or "an2cn" (Arabic to Chinese).
        :return: the sentence with the recognised numbers converted.
        :raises ValueError: if ``method`` is neither "cn2an" nor "an2cn".
        """
        if method == "cn2an":
            inputs = inputs.replace("廿", "二十").replace("半", "0.5").replace("两", "2")
            year = fr"(({self.smart_cn_pattern})|({self.cn_pattern}))年"
            month_or_day = f"[{self.all_num}十]+"
            steps = (
                ("date", fr"({year})?({month_or_day}月)?({month_or_day}日)?"),
                ("fraction", fr"{self.cn_pattern}分之{self.cn_pattern}"),
                ("percent", fr"百分之{self.cn_pattern}"),
                ("celsius", fr"{self.cn_pattern}摄氏度"),
                ("number", self.cn_pattern),
            )
        elif method == "an2cn":
            steps = (
                ("date", r"(\d{2,4}年)?(\d{1,2}月)?(\d{1,2}日)?"),
                ("fraction", r"\d+/\d+"),
                ("percent", r"-?(\d+\.)?\d+%"),
                # Same numeric shape as percent/number so that negative and
                # decimal temperatures are converted as one value.
                ("celsius", r"-?(\d+\.)?\d+℃"),
                ("number", r"-?(\d+\.)?\d+"),
            )
        else:
            raise ValueError(f"error method: {method}, only support 'cn2an' and 'an2cn'!")

        # The order matters: the specific forms must be rewritten before the
        # generic "number" step sees their digits.
        for sub_mode, pattern in steps:
            inputs = re.sub(
                pattern,
                lambda match, sub_mode=sub_mode: self.__sub_util(match.group(), method, sub_mode),
                inputs,
            )
        return inputs

    def __sub_util(self, inputs, method: str = "cn2an", sub_mode: str = "number") -> str:
        """Convert one matched span; return it unchanged if conversion fails.

        :param inputs: the matched text (may be empty, since the date patterns
            consist of optional groups only).
        :param method: "cn2an" for Chinese to Arabic; anything else is treated
            as Arabic to Chinese.
        :param sub_mode: "date", "fraction", "percent", "celsius" or "number".
        :return: the converted text, or ``inputs`` itself when it is empty or
            the conversion raised.
        """
        if not inputs:
            return inputs
        try:
            if method == "cn2an":
                return self.__sub_cn2an(inputs, sub_mode)
            return self.__sub_an2cn(inputs, sub_mode)
        except Exception as e:
            # Deliberately best-effort: text that merely looks like a number
            # must not abort the whole sentence, so warn and keep the span.
            warn(str(e))
            return inputs

    def __sub_cn2an(self, inputs: str, sub_mode: str) -> str:
        """Convert one Chinese-notation span of kind ``sub_mode`` to Arabic notation."""

        def to_arabic(match) -> str:
            return str(self.cn2an(match.group(), "smart"))

        if sub_mode == "date":
            return re.sub(fr"(({self.smart_cn_pattern})|({self.cn_pattern}))", to_arabic, inputs)
        if sub_mode == "fraction":
            # "百分之..." is a percentage; leave it for the percent step.
            if inputs[0] == "百":
                return inputs
            frac_result = re.sub(self.cn_pattern, to_arabic, inputs)
            # Chinese names the denominator first: 二分之一 is 1/2.
            denominator, numerator = frac_result.split("分之")
            return f"{numerator}/{denominator}"
        if sub_mode == "percent":
            return re.sub(f"(?<=百分之){self.cn_pattern}", to_arabic, inputs).replace("百分之", "") + "%"
        if sub_mode == "celsius":
            return re.sub(f"{self.cn_pattern}(?=摄氏度)", to_arabic, inputs).replace("摄氏度", "℃")
        if sub_mode == "number":
            return str(self.cn2an(inputs, "smart"))
        raise Exception(f"error sub_mode: {sub_mode} !")

    def __sub_an2cn(self, inputs: str, sub_mode: str) -> str:
        """Convert one Arabic-notation span of kind ``sub_mode`` to Chinese notation."""

        def to_chinese(match) -> str:
            return self.an2cn(match.group(), "low")

        if sub_mode == "date":
            # The year is converted in "direct" mode, month and day in "low" mode.
            inputs = re.sub(r"\d+(?=年)", lambda x: self.an2cn(x.group(), "direct"), inputs)
            return re.sub(r"\d+", to_chinese, inputs)
        if sub_mode == "fraction":
            frac_result = re.sub(r"\d+", to_chinese, inputs)
            numerator, denominator = frac_result.split("/")
            return f"{denominator}分之{numerator}"
        if sub_mode == "celsius":
            # Drop the trailing "℃" before converting the number.
            return self.an2cn(inputs[:-1], "low") + "摄氏度"
        if sub_mode == "percent":
            # Drop the trailing "%" before converting the number.
            return "百分之" + self.an2cn(inputs[:-1], "low")
        if sub_mode == "number":
            return self.an2cn(inputs, "low")
        raise Exception(f"error sub_mode: {sub_mode} !")
|
|
|
@@ -0,0 +1,40 @@
|
|
1
|
+import unittest
|
|
2
|
+
|
|
3
|
+from .transform import Transform
|
|
4
|
+
|
|
5
|
+
|
|
6
|
class TransformTest(unittest.TestCase):
    """Sentence-level conversion cases for ``Transform.transform``."""

    def setUp(self) -> None:
        # Arabic-notation sentence -> Chinese-notation sentence; every pair
        # is expected to convert in both directions.
        self.strict_data_dict = {
            "小王捡了100块钱": "小王捡了一百块钱",
            "用户增长最快的3个城市": "用户增长最快的三个城市",
            "小王的生日是2001年3月4日": "小王的生日是二零零一年三月四日",
            "小王的生日是2012年12月12日": "小王的生日是二零一二年十二月十二日",
            "今天股价上涨了8%": "今天股价上涨了百分之八",
            "第2天股价下降了-3.8%": "第二天股价下降了百分之负三点八",
            "抛出去的硬币为正面的概率是1/2": "抛出去的硬币为正面的概率是二分之一",
            "现在室内温度为39℃，很热啊！": "现在室内温度为三十九摄氏度，很热啊！",
            "创业板指9月9日早盘低开1.57%": "创业板指九月九日早盘低开百分之一点五七"
        }

        # Input sentence -> expected output, checked for "cn2an" only.
        self.smart_data_dict = {
            "约2.5亿年~6500万年": "约250000000年~65000000年",
            "廿二日，日出东方": "22日，日出东方",
            "大陆": "大陆",
            "半斤": "0.5斤",
            "两个": "2个",
        }

        self.t = Transform()

    def test_transform(self) -> None:
        """Round-trip the two-way pairs, then check the one-way cn2an cases."""
        for arabic_text, chinese_text in self.strict_data_dict.items():
            self.assertEqual(self.t.transform(arabic_text, "an2cn"), chinese_text)
            self.assertEqual(self.t.transform(chinese_text, "cn2an"), arabic_text)

        for source_text, expected_text in self.smart_data_dict.items():
            self.assertEqual(self.t.transform(source_text, "cn2an"), expected_text)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
if __name__ == "__main__":
    # Run the tests when this module is executed directly.
    unittest.main()
|