discourse/lib/tiny_japanese_segmenter.rb
Alan Guo Xiang Tan 930f51e175 FEATURE: Split up text segmentation for Chinese and Japanese.
* Chinese segmentation will continue to rely on cppjieba
* Japanese segmentation will use our port of TinySegmenter
* Korean currently does not rely on segmentation which was dropped in c677877e4fe5381f613279901f36ae255c909573
* SiteSetting.search_tokenize_chinese_japanese_korean has been split
into SiteSetting.search_tokenize_chinese and
SiteSetting.search_tokenize_japanese respectively
2022-02-07 09:21:14 +08:00

174 lines
24 KiB
Ruby

# frozen_string_literal: true
# Ruby port of http://chasen.org/~taku/software/TinySegmenter/tiny_segmenter-0.2.js
# This is essentially a trained machine learning model used to segment words in Japanese.
# Discourse core uses it for "best effort" segmentation of Japanese text for search.
class TinyJapaneseSegmenter
# Ordered [Regexp, type code] pairs consulted by `ctype`; the first matching pattern
# wins, so the kanji numerals ("M") must stay ahead of the general kanji range ("H").
# Half-width katakana and full-width Latin letters/digits are written as \u escapes
# so that Unicode (NFKC) normalisation of this file cannot silently fold them into
# their ordinary counterparts and thereby drop them from the character classes.
CHARTYPE = {
  "[一二三四五六七八九十百千万億兆]" => "M",
  "[一-龠々〆ヵヶ]" => "H",
  "[ぁ-ん]" => "I",
  # Katakana, plus half-width katakana U+FF71-U+FF9D, the half-width voiced
  # sound mark U+FF9E and the half-width prolonged sound mark U+FF70.
  "[ァ-ヴー\uFF71-\uFF9D\uFF9E\uFF70]" => "K",
  # ASCII letters plus full-width a-z (U+FF41-U+FF5A) and A-Z (U+FF21-U+FF3A).
  "[a-zA-Z\uFF41-\uFF5A\uFF21-\uFF3A]" => "A",
  # ASCII digits plus full-width 0-9 (U+FF10-U+FF19).
  "[0-9\uFF10-\uFF19]" => "N"
}.map { |pattern, value| [Regexp.compile(pattern), value] }.freeze
# Feature weights of the trained model. For every candidate boundary position
# `segment` starts from BIAS and adds the weight found in each table for a key
# built from the surrounding context; a key with no entry contributes 0.
# Key ingredients (as named in `segment`):
#   w1..w6 - the six characters around the position (w4 is the character at it)
#   c1..c6 - their character-type codes as returned by `ctype`
#   p1..p3 - the three previous decisions ("U" initial, "B" boundary, "O" none)
#
# Starting score of every boundary decision.
BIAS = -322
# Character-type bigrams: BC1 is keyed by c2+c3, BC2 by c3+c4, BC3 by c4+c5.
BC1 = { "HH" => 6, "II" => 2461, "KH" => 406, "OH" => -1378 }
BC2 = { "AA" => -3267, "AI" => 2744, "AN" => -878, "HH" => -4070, "HM" => -1711, "HN" => 4012, "HO" => 3761, "IA" => 1327, "IH" => -1184, "II" => -1332, "IK" => 1721, "IO" => 5492, "KI" => 3831, "KK" => -8741, "MH" => -3132, "MK" => 3334, "OO" => -2920 }
BC3 = { "HH" => 996, "HI" => 626, "HK" => -721, "HN" => -1307, "HO" => -836, "IH" => -301, "KK" => 2762, "MK" => 1079, "MM" => 4034, "OA" => -1652, "OH" => 266 }
# Previous-decision bigrams: BP1 is keyed by p1+p2, BP2 by p2+p3.
BP1 = { "BB" => 295, "OB" => 304, "OO" => -125, "UB" => 352 }
BP2 = { "BO" => 60, "OO" => -1762 }
# One previous decision followed by a character-type bigram:
# BQ1 = p2+c2+c3, BQ2 = p2+c3+c4, BQ3 = p3+c2+c3, BQ4 = p3+c3+c4.
BQ1 = { "BHH" => 1150, "BHM" => 1521, "BII" => -1158, "BIM" => 886, "BMH" => 1208, "BNH" => 449, "BOH" => -91, "BOO" => -2597, "OHI" => 451, "OIH" => -296, "OKA" => 1851, "OKH" => -1020, "OKK" => 904, "OOO" => 2965 }
BQ2 = { "BHH" => 118, "BHI" => -1159, "BHM" => 466, "BIH" => -919, "BKK" => -1720, "BKO" => 864, "OHH" => -1139, "OHM" => -181, "OIH" => 153, "UHI" => -1146 }
BQ3 = { "BHH" => -792, "BHI" => 2664, "BII" => -299, "BKI" => 419, "BMH" => 937, "BMM" => 8335, "BNN" => 998, "BOH" => 775, "OHH" => 2174, "OHM" => 439, "OII" => 280, "OKH" => 1798, "OKI" => -793, "OKO" => -2242, "OMH" => -2402, "OOO" => 11699 }
BQ4 = { "BHH" => -3895, "BIH" => 3761, "BII" => -4654, "BIK" => 1348, "BKK" => -1806, "BMI" => -3385, "BOO" => -12396, "OAH" => 926, "OHH" => 266, "OHK" => -2036, "ONN" => -973 }
BW1 = { ",と" => 660, ",同" => 727, "B1あ" => 1404, "B1同" => 542, "、と" => 660, "、同" => 727, "」と" => 1682, "あっ" => 1505, "いう" => 1743, "いっ" => -2055, "いる" => 672, "うし" => -4817, "うん" => 665, "から" => 3472, "がら" => 600, "こう" => -790, "こと" => 2083, "こん" => -1262, "さら" => -4143, "さん" => 4573, "した" => 2641, "" => 1104, "" => -3399, "" => 1977, "" => -871, "" => 1122, "" => 601, "" => 3463, "" => -802, "" => 805, "" => 1249, "" => 1127, "" => 3445, "" => 844, "" => -4915, "" => 1922, "" => 3887, "" => 5713, "" => 3015, "" => 7379, "" => -1113, "" => 2468, "" => 1498, "" => 1671, "" => -912, "" => -501, "" => 741, "" => 2448, "" => 1711, "" => 2600, "" => -2155, "" => -1947, "" => -2565, "" => 2369, "" => -913, "" => 1860, "" => 731, "" => -1886, "" => 2558, "" => -2784, "" => -2604, "" => 1497, "" => -2314, "" => -1336, "" => -195, "" => -2423, "" => -2113, "" => -724, "" => 1404, "" => 542, "" => 1682 }
BW2 = { ".." => -11822, "11" => -669, "" => -5730, "" => -13175, "" => -1609, "" => 2490, "" => -1350, "" => -602, "" => -7194, "" => 4612, "" => 853, "" => -3198, "" => 1941, "" => -1597, "" => -8392, "" => -4193, "" => 4533, "" => 13168, "" => -3977, "" => -1819, "" => -545, "" => 5078, "" => 972, "" => 939, "" => -3744, "" => -1253, "" => -662, "" => -3857, "" => -786, "" => 1224, "" => -939, "" => 4589, "" => 1647, "" => -2094, "" => 6144, "" => 3640, "" => 2551, "" => -3110, "" => -3065, "" => 2666, "" => -1528, "" => -3828, "" => -4761, "" => -4203, "" => 1890, "" => -1746, "" => -2279, "" => 720, "" => 5168, "" => -3941, "" => -2488, "" => -1313, "" => -6509, "" => 2614, "" => 3099, "" => -1615, "" => 2748, "" => 2454, "" => -7236, "" => -14943, "" => -4688, "" => -11388, "" => 2093, "" => -7059, "" => -6041, "" => -6125, "" => 1073, "" => -1033, "" => -2532, "" => 1813, "" => -1316, "" => -6621, "" => 5409, "" => -3153, "" => 2230, "" => -10713, "" => -944, "" => -1611, "" => -1897, "" => 651, "" => 1620, "" => 4270, "" => 849, "" => 4114, "" => 6067, "" => 7901, "" => -11877, "" => 728, "" => -4115, "" => 602, "" => -1375, "" => 970, "" => -1051, "" => -4479, "" => -1116, "" => 2163, "" => -7758, "" => 970, "" => -913, "" => -2471, "" => -1250, "" => -1050, "" => -8669, "" => -1626, "" => -2363, "" => -1982, "" => -4066, "" => -722, "" => -7068, "" => 3372, "" => -601, "" => -2355, "" => -2697, "" => -1543, "" => -1384, "" => -1276, "" => -990, "" => -1612, "" => -4268, "11" => -669 }
BW3 = { "" => -2194, "" => 719, "" => 3846, "." => -1185, "" => -1185, "" => 5308, "" => 2079, "" => 3029, "" => 2056, "" => 1883, "" => 5600, "" => 1527, "" => 1117, "" => 4798, "" => 1454, "." => 2857, "" => 2857, "" => -743, "" => -4098, "" => -669, "" => 6520, "" => -2670, "," => 1816, "" => 1816, "" => -4855, "" => -1127, "" => -913, "" => -4977, "" => -2064, "" => 1645, "" => 1374, "" => 7397, "" => 1542, "" => -2757, "" => -714, "" => 976, "," => 1557, "" => 1557, "" => -3714, "" => 3562, "" => 1449, "" => 2608, "" => 1200, "." => -1310, "" => -1310, "" => 6521, "," => 3426, "" => 3426, "" => 841, "" => 428, "." => 8875, "" => 8875, "" => -594, "" => 812, "" => -1183, "" => -853, "." => 4098, "" => 4098, "" => 1004, "" => -4748, "" => 300, "" => 6240, "" => 855, "" => 302, "" => 1437, "" => -1482, "" => 2295, "" => -1387, "" => 2266, "" => 541, "" => -3543, "" => 4664, "" => 1796, "" => -903, "" => 2135, "," => -1021, "" => -1021, "" => 1771, "" => 1906, "" => 2644, "," => -724, "" => -724, "" => -1000, "," => 1337, "" => 1337, "" => 2181, "" => 1113, "" => 6943, "" => -1549, "" => 6154, "" => -793, "" => 1479, "" => 6820, "" => 3818, "," => 854, "" => 854, "" => 1850, "" => 1375, "" => -3246, "" => 1091, "" => -605, "" => 606, "" => 798, "" => 990, "" => 860, "" => 1232, "" => 2217, "" => 1681, "" => 965, "" => -5055, "," => 974, "" => 974, "" => 2024, "" => 990 }
# Character-type trigram weights read by `segment`:
# TC1 is keyed by c1+c2+c3, TC2 by c2+c3+c4, TC3 by c3+c4+c5, TC4 by c4+c5+c6,
# where c1..c6 are the `ctype` codes of the six characters around the position.
TC1 = { "AAA" => 1093, "HHH" => 1029, "HHM" => 580, "HII" => 998, "HOH" => -390, "HOM" => -331, "IHI" => 1169, "IOH" => -142, "IOI" => -1015, "IOM" => 467, "MMH" => 187, "OOI" => -1832 }
TC2 = { "HHO" => 2088, "HII" => -1023, "HMM" => -1154, "IHI" => -1965, "KKH" => 703, "OII" => -2649 }
TC3 = { "AAA" => -294, "HHH" => 346, "HHI" => -341, "HII" => -1088, "HIK" => 731, "HOH" => -1486, "IHH" => 128, "IHI" => -3041, "IHO" => -1935, "IIH" => -825, "IIM" => -1035, "IOI" => -542, "KHH" => -1216, "KKA" => 491, "KKH" => -1217, "KOK" => -1009, "MHH" => -2694, "MHM" => -457, "MHO" => 123, "MMH" => -471, "NNH" => -1689, "NNO" => 662, "OHO" => -3393 }
TC4 = { "HHH" => -203, "HHI" => 1344, "HHK" => 365, "HHM" => -122, "HHN" => 182, "HHO" => 669, "HIH" => 804, "HII" => 679, "HOH" => 446, "IHH" => 695, "IHO" => -2324, "IIH" => 321, "III" => 1497, "IIO" => 656, "IOO" => 54, "KAK" => 4845, "KKA" => 3386, "KKK" => 3065, "MHH" => -405, "MHI" => 201, "MMH" => -241, "MMM" => 661, "MOM" => 841 }
# One previous decision (p2 or p3: "U", "B" or "O") followed by a character-type trigram:
# TQ1 = p2+c1+c2+c3, TQ2 = p2+c2+c3+c4, TQ3 = p3+c1+c2+c3, TQ4 = p3+c2+c3+c4.
TQ1 = { "BHHH" => -227, "BHHI" => 316, "BHIH" => -132, "BIHH" => 60, "BIII" => 1595, "BNHH" => -744, "BOHH" => 225, "BOOO" => -908, "OAKK" => 482, "OHHH" => 281, "OHIH" => 249, "OIHI" => 200, "OIIH" => -68 }
TQ2 = { "BIHH" => -1401, "BIII" => -1033, "BKAK" => -543, "BOOO" => -5591 }
TQ3 = { "BHHH" => 478, "BHHM" => -1073, "BHIH" => 222, "BHII" => -504, "BIIH" => -116, "BIII" => -105, "BMHI" => -863, "BMHM" => -464, "BOMH" => 620, "OHHH" => 346, "OHHI" => 1729, "OHII" => 997, "OHMH" => 481, "OIHH" => 623, "OIIH" => 1344, "OKAK" => 2792, "OKHH" => 587, "OKKA" => 679, "OOHH" => 110, "OOII" => -685 }
TQ4 = { "BHHH" => -721, "BHHM" => -3604, "BHII" => -966, "BIIH" => -607, "BIII" => -2181, "OAAA" => -2763, "OAKK" => 180, "OHHH" => -294, "OHHI" => 2446, "OHHO" => 480, "OHIH" => -1573, "OIHH" => 1935, "OIHI" => -493, "OIIH" => 626, "OIII" => -4007, "OKAK" => -8156 }
TW1 = { "" => -4681, "" => 2026 }
TW2 = { "" => -2049, "" => -1256, "" => -2434, "" => 3873, "" => -4430, "" => -1049, "" => 1833, "" => -4657, "" => -4517, "" => 1882, "" => -792, "" => -1512, "" => -8097, "" => -1255, "" => -2721, "" => -3216 }
TW3 = { "" => -1734, "" => 1314, "" => -4314, "" => -5483, "" => -5989, "" => -6247, "," => -727, "" => -727, "" => -600, "" => -3752, "" => -2287 }
TW4 = { "." => 8576, "" => 8576, "" => -2348, "" => 2958, "," => 1516, "" => 1516, "" => 1538, "" => 1349, "" => 5543, "" => 1097, "" => -4258, "" => 5865 }
# Single character-type weights read by `segment`: UCn is keyed by cn, the `ctype`
# code of the n-th of the six characters around the position (c4 is the character at it).
UC1 = { "A" => 484, "K" => 93, "M" => 645, "O" => -505 }
UC2 = { "A" => 819, "H" => 1059, "I" => 409, "M" => 3987, "N" => 5775, "O" => 646 }
UC3 = { "A" => -1370, "I" => 2311 }
UC4 = { "A" => -2643, "H" => 1809, "I" => -1032, "K" => -3450, "M" => 3565, "N" => 3876, "O" => 6646 }
UC5 = { "H" => 313, "I" => -1238, "K" => -799, "M" => 539, "O" => -831 }
UC6 = { "H" => -506, "I" => -253, "K" => 87, "M" => 247, "O" => -387 }
# Single previous-decision weights: UPn is keyed by pn ("U" initial, "B" boundary, "O" none).
UP1 = { "O" => -214 }
UP2 = { "B" => 69, "O" => 935 }
UP3 = { "B" => 189 }
# Previous decision paired with a character type: UQ1 = p1+c1, UQ2 = p2+c2, UQ3 = p3+c3.
UQ1 = { "BH" => 21, "BI" => -12, "BK" => -99, "BN" => 142, "BO" => -56, "OH" => -95, "OI" => 477, "OK" => 410, "OO" => -2422 }
UQ2 = { "BH" => 216, "BI" => 113, "OK" => 1759 }
UQ3 = { "BA" => -479, "BH" => 42, "BI" => 1913, "BK" => -7198, "BM" => 3160, "BN" => 6427, "BO" => 14761, "OI" => -827, "ON" => -3212 }
UW1 = { "," => 156, "" => 156, "" => -463, "" => -941, "" => -127, "" => -553, "" => 121, "" => 505, "" => -201, "" => -547, "" => -123, "" => -789, "" => -185, "" => -847, "" => -466, "" => -470, "" => 182, "" => -292, "" => 208, "" => 169, "" => -446, "" => -137, "" => -135, "" => -402, "" => -268, "" => -912, "" => 871, "" => -460, "" => 561, "" => 729, "" => -411, "" => -141, "" => 361, "" => -408, "" => -386, "" => -718, "" => -463, "" => -135 }
UW2 = { "," => -829, "" => -829, "" => 892, "" => -645, "" => 3145, "" => -538, "" => 505, "" => 134, "" => -502, "" => 1454, "" => -856, "" => -412, "" => 1141, "" => 878, "" => 540, "" => 1529, "" => -675, "" => 300, "" => -1011, "" => 188, "" => 1837, "" => -949, "" => -291, "" => -268, "" => -981, "" => 1273, "" => 1063, "" => -1764, "" => 130, "" => -409, "" => -1273, "" => 1261, "" => 600, "" => -1263, "" => -402, "" => 1639, "" => -579, "" => -694, "" => 571, "" => -2516, "" => 2095, "" => -587, "" => 306, "" => 568, "" => 831, "" => -758, "" => -2150, "" => -302, "" => -968, "" => -861, "" => 492, "" => -123, "" => 978, "" => 362, "" => 548, "" => -3025, "" => -1566, "" => -3414, "" => -422, "" => -1769, "" => -865, "" => -483, "" => -1519, "" => 760, "" => 1023, "" => -2009, "" => -813, "" => -1060, "" => 1067, "" => -1519, "" => -1033, "" => 1522, "" => -1355, "" => -1682, "" => -1815, "" => -1462, "" => -630, "" => -1843, "" => -1650, "" => -931, "" => -665, "" => -2378, "" => -180, "" => -1740, "" => 752, "" => 529, "" => -1584, "" => -242, "" => -1165, "" => -763, "" => 810, "" => 509, "" => -1353, "" => 838, "西" => -744, "" => -3874, "調" => 1010, "" => 1198, "" => 3041, "" => 1758, "" => -1257, "" => -645, "" => 3145, "" => 831, "" => -587, "" => 306, "" => 568 }
UW3 = { "," => 4889, "1" => -800, "" => -1723, "" => 4889, "" => -2311, "" => 5827, "" => 2670, "" => -3573, "" => -2696, "" => 1006, "" => 2342, "" => 1983, "" => -4864, "" => -1163, "" => 3271, "" => 1004, "" => 388, "" => 401, "" => -3552, "" => -3116, "" => -1058, "" => -395, "" => 584, "" => 3685, "" => -5228, "" => 842, "" => -521, "" => -1444, "" => -1081, "" => 6167, "" => 2318, "" => 1691, "" => -899, "" => -2788, "" => 2745, "" => 4056, "" => 4555, "" => -2171, "" => -1798, "" => 1199, "" => -5516, "" => -4384, "" => -120, "" => 1205, "" => 2323, "" => -788, "" => -202, "" => 727, "" => 649, "" => 5905, "" => 2773, "" => -1207, "" => 6620, "" => -518, "" => 551, "" => 1319, "" => 874, "" => -1350, "" => 521, "" => 1109, "" => 1591, "" => 2201, "" => 278, "" => -3794, "" => -1619, "" => -1759, "" => -2087, "" => 3815, "" => 653, "" => -758, "" => -1193, "" => 974, "" => 2742, "" => 792, "" => 1889, "" => -1368, "" => 811, "" => 4265, "" => -361, "" => -2439, "" => 4858, "" => 3593, "" => 1574, "" => -3030, "" => 755, "" => -1880, "" => 5807, "" => 3095, "" => 457, "" => 2475, "" => 1129, "" => 2286, "" => 4437, "" => 365, "" => -949, "" => -1872, "" => 1327, "" => -1038, "" => 4646, "" => -2309, "" => -783, "" => -1006, "" => 483, "" => 1233, "" => 3588, "" => -241, "" => 3906, "" => -837, "" => 4513, "" => 642, "" => 1389, "" => 1219, "" => -241, "" => 2016, "" => -1356, "" => -423, "" => -1008, "" => 1078, "" => -513, "" => -3102, "" => 1155, "" => 3197, "" => -1804, "" => 2416, "" => -1030, "" => 1605, "" => 1452, "" => -2352, "" => -3885, "" => 1905, "" => -1291, "" => 1822, "" => -488, "" => -3973, "" => -2013, "" => -1479, "" => 3222, "" => -1489, "" => 1764, "" => 2099, "" => 5792, "" => -661, "" => -1248, "" => -951, "" => -937, "" => 4125, "" => 360, "" => 3094, "" => 364, "" => -805, "" => 5156, "" => 2438, "" => 484, "" => 2613, "" => -1694, "" => -1073, "" => 1868, "" => -495, "" => 979, "" => 461, "" => -3850, "" => -273, "" => 914, "" => 
1215, "" => 7313, "" => -1835, "" => 792, "" => 6293, "" => -1528, "" => 4231, "" => 401, "" => -960, "" => 1201, "" => 7767, "" => 3066, "" => 3663, "" => 1384, "" => -4229, "" => 1163, "" => 1255, "" => 6457, "" => 725, "" => -2869, "" => 785, "" => 1044, "調" => -562, "" => -733, "" => 1777, "" => 1835, "" => 1375, "" => -1504, "" => -1136, "" => -681, "" => 1026, "" => 4404, "" => 1200, "" => 2163, "" => 421, "" => -1432, "" => 1302, "" => -1282, "" => 2009, "" => -1045, "" => 2066, "" => 1620, "" => -800, "" => 2670, "" => -3794, "" => -1350, "" => 551, "" => 1319, "" => 874, "" => 521, "" => 1109, "" => 1591, "" => 2201, "" => 278 }
UW4 = { "," => 3930, "." => 3508, "" => -4841, "" => 3930, "" => 3508, "" => 4999, "" => 1895, "" => 3798, "" => -5156, "" => 4752, "" => -3435, "" => -640, "" => -2514, "" => 2405, "" => 530, "" => 6006, "" => -4482, "" => -3821, "" => -3788, "" => -4376, "" => -4734, "" => 2255, "" => 1979, "" => 2864, "" => -843, "" => -2506, "" => -731, "" => 1251, "" => 181, "" => 4091, "" => 5034, "" => 5408, "" => -3654, "" => -5882, "" => -1659, "" => 3994, "" => 7410, "" => 4547, "" => 5433, "" => 6499, "" => 1853, "" => 1413, "" => 7396, "" => 8578, "" => 1940, "" => 4249, "" => -4134, "" => 1345, "" => 6665, "" => -744, "" => 1464, "" => 1051, "" => -2082, "" => -882, "" => -5046, "" => 4169, "" => -2666, "" => 2795, "" => -1544, "" => 3351, "" => -2922, "" => -9726, "" => -14896, "" => -2613, "" => -4570, "" => -1783, "" => 13150, "" => -2352, "" => 2145, "" => 1789, "" => 1287, "" => -724, "" => -403, "" => -1635, "" => -881, "" => -541, "" => -856, "" => -3637, "" => -4371, "" => -11870, "" => -2069, "" => 2210, "" => 782, "" => -190, "" => -1768, "" => 1036, "" => 544, "" => 950, "" => -1286, "" => 530, "" => 4292, "" => 601, "" => -2006, "" => -1212, "" => 584, "" => 788, "" => 1347, "" => 1623, "" => 3879, "" => -302, "" => -740, "" => -2715, "" => 776, "" => 4517, "" => 1013, "" => 1555, "" => -1834, "" => -681, "" => -910, "" => -851, "" => 1500, "" => -619, "" => -1200, "" => 866, "" => -1410, "" => -2094, "" => -1413, "" => 1067, "" => 571, "" => -4802, "" => -1397, "" => -1057, "" => -809, "" => 1910, "" => -1328, "" => -1500, "" => -2056, "" => -2667, "" => 2771, "" => 374, "" => -4556, "" => 456, "" => 553, "" => 916, "" => -1566, "" => 856, "" => 787, "" => 2182, "" => 704, "" => 522, "" => -856, "" => 1798, "" => 1829, "" => 845, "" => -9066, "" => -485, "" => -442, "" => -360, "" => -1043, "" => 5388, "" => -2716, "" => -910, "" => -939, "" => -543, "" => -735, "" => 672, "" => -1267, "" => -1286, "" => -1101, "" => -2900, "" => 1826, "" => 2586, "" => 
922, "" => -3485, "" => 2997, "" => -867, "" => -2112, "" => 788, "" => 2937, "" => 786, "" => 2171, "" => 1146, "" => -1169, "" => 940, "" => -994, "" => 749, "" => 2145, "" => -730, "" => -852, "" => -792, "" => 792, "" => -1184, "" => -244, "" => -1000, "" => 730, "" => -1481, "" => 1158, "" => -1433, "" => -3370, "" => 929, "" => -1291, "" => 2596, "" => -4866, "" => 1192, "" => -1100, "" => -2213, "" => 357, "" => -2344, "" => -2297, "" => -2604, "" => -878, "" => -1659, "" => -792, "" => -1984, "" => 1749, "" => 2120, "" => 1895, "" => 3798, "" => -4371, "" => -724, "" => -11870, "" => 2145, "" => 1789, "" => 1287, "" => -403, "" => -1635, "" => -881, "" => -541, "" => -856, "" => -3637 }
UW5 = { "," => 465, "." => -299, "1" => -514, "E2" => -32768, "]" => -2762, "" => 465, "" => -299, "" => 363, "" => 1655, "" => 331, "" => -503, "" => 1199, "" => 527, "" => 647, "" => -421, "" => 1624, "" => 1971, "" => 312, "" => -983, "" => -1537, "" => -1371, "" => -852, "" => -1186, "" => 1093, "" => 52, "" => 921, "" => -18, "" => -850, "" => -127, "" => 1682, "" => -787, "" => -1224, "" => -635, "" => -578, "" => 1001, "" => 502, "" => 865, "" => 3350, "" => 854, "" => -208, "" => 429, "" => 504, "" => 419, "" => -1264, "" => 327, "" => 241, "" => 451, "" => -343, "" => -871, "" => 722, "" => -1153, "" => -654, "" => 3519, "" => -901, "" => 848, "" => 2104, "" => -1296, "" => -548, "" => 1785, "" => -1304, "" => -2991, "" => 921, "" => 1763, "" => 872, "" => -814, "" => 1618, "" => -1682, "" => 218, "" => -4353, "" => 932, "" => 1356, "" => -1508, "" => -1347, "" => 240, "" => -3912, "" => -3149, "" => 1319, "" => -1052, "" => -4003, "" => -997, "" => -278, "" => -813, "" => 1955, "" => -2233, "" => 663, "" => -1073, "" => 1219, "" => -1018, "" => -368, "" => 786, "" => 1191, "" => 2368, "" => -689, "" => -514, "" => -32768, "" => 363, "" => 241, "" => 451, "" => -343 }
UW6 = { "," => 227, "." => 808, "1" => -270, "E1" => 306, "" => 227, "" => 808, "" => -307, "" => 189, "" => 241, "" => -73, "" => -121, "" => -200, "" => 1782, "" => 383, "" => -428, "" => 573, "" => -1014, "" => 101, "" => -105, "" => -253, "" => -149, "" => -417, "" => -236, "" => -206, "" => 187, "" => -135, "" => 195, "" => -673, "" => -496, "" => -277, "" => 201, "" => -800, "" => 624, "" => 302, "" => 1792, "" => -1212, "" => 798, "" => -960, "" => 887, "" => -695, "" => 535, "" => -697, "" => 753, "" => -507, "" => 974, "" => -822, "" => 1811, "" => 463, "" => 1082, "" => -270, "" => 306, "" => -673, "" => -496 }
class << self
# Splits text into an array of word segments.
#
# The text is walked one character at a time. For every position after the
# first character the surrounding context is scored against the weight tables
# (see boundary_score); a positive score closes the current word and starts a
# new one at that character.
#
# @param text [String, nil] text to segment
# @return [Array<String>] the segments in order; [] when text is nil or blank
def segment(text)
  return [] if text.nil? || text.strip.empty?

  chars = text.chars
  # Sentinels on both ends give every real character three neighbours on each side.
  segments = %w[B3 B2 B1] + chars + %w[E1 E2 E3]
  ctypes = %w[O O O] + chars.map { |char| ctype(char) } + %w[O O O]

  result = []
  # The buffer is appended to in place, so it must not alias an element of
  # `segments`: that element is still read as left-hand context later on.
  word = chars.first.dup
  # The three previous decisions: "U" = none yet, "B" = boundary, "O" = no boundary.
  p1 = p2 = p3 = "U"

  # Index 3 is the first real character (already in `word`); size - 4 is the last one.
  4.upto(segments.size - 4) do |i|
    boundary = boundary_score(segments, ctypes, i, p1, p2, p3) > 0
    if boundary
      result << word
      word = +""
    end
    p1, p2, p3 = p2, p3, (boundary ? "B" : "O")
    word << segments[i]
  end

  result << word
end

# Sums BIAS and every feature weight that applies at position i; a positive
# total means a new word starts at segments[i]. Keys missing from a table add 0.
#
# segments/ctypes are the sentinel-padded character and character-type arrays,
# p1..p3 the three previous decisions (oldest first).
private def boundary_score(segments, ctypes, i, p1, p2, p3)
  w1, w2, w3, w4, w5, w6 = segments[i - 3, 6]
  c1, c2, c3, c4, c5, c6 = ctypes[i - 3, 6]

  score = BIAS

  # Previous decisions.
  score += UP1.fetch(p1, 0) + UP2.fetch(p2, 0) + UP3.fetch(p3, 0)
  score += BP1.fetch(p1 + p2, 0) + BP2.fetch(p2 + p3, 0)

  # Characters: unigrams, bigrams, trigrams.
  score += UW1.fetch(w1, 0) + UW2.fetch(w2, 0) + UW3.fetch(w3, 0)
  score += UW4.fetch(w4, 0) + UW5.fetch(w5, 0) + UW6.fetch(w6, 0)
  score += BW1.fetch(w2 + w3, 0) + BW2.fetch(w3 + w4, 0) + BW3.fetch(w4 + w5, 0)
  score += TW1.fetch(w1 + w2 + w3, 0) + TW2.fetch(w2 + w3 + w4, 0)
  score += TW3.fetch(w3 + w4 + w5, 0) + TW4.fetch(w4 + w5 + w6, 0)

  # Character types: unigrams, bigrams, trigrams.
  score += UC1.fetch(c1, 0) + UC2.fetch(c2, 0) + UC3.fetch(c3, 0)
  score += UC4.fetch(c4, 0) + UC5.fetch(c5, 0) + UC6.fetch(c6, 0)
  score += BC1.fetch(c2 + c3, 0) + BC2.fetch(c3 + c4, 0) + BC3.fetch(c4 + c5, 0)
  score += TC1.fetch(c1 + c2 + c3, 0) + TC2.fetch(c2 + c3 + c4, 0)
  score += TC3.fetch(c3 + c4 + c5, 0) + TC4.fetch(c4 + c5 + c6, 0)
  # score += TC5[c4 + c5 + c6].to_i  (kept disabled, as in the code this replaces)

  # Previous decisions combined with character types.
  score += UQ1.fetch(p1 + c1, 0) + UQ2.fetch(p2 + c2, 0) + UQ3.fetch(p3 + c3, 0)
  score += BQ1.fetch(p2 + c2 + c3, 0) + BQ2.fetch(p2 + c3 + c4, 0)
  score += BQ3.fetch(p3 + c2 + c3, 0) + BQ4.fetch(p3 + c3 + c4, 0)
  score += TQ1.fetch(p2 + c1 + c2 + c3, 0) + TQ2.fetch(p2 + c2 + c3 + c4, 0)
  score += TQ3.fetch(p3 + c1 + c2 + c3, 0) + TQ4.fetch(p3 + c2 + c3 + c4, 0)

  score
end
private
# Returns the type code of the first CHARTYPE pattern that matches text,
# or "O" (other) when none of them does.
#
# @param text [String] the string to classify (`segment` passes one character)
# @return [String] a CHARTYPE type code, or "O"
def ctype(text)
  # match? answers the yes/no question without building a MatchData object.
  matched = CHARTYPE.find { |regexp, _value| text.match?(regexp) }
  matched ? matched.last : "O"
end
end
end