mirror of
https://github.com/discourse/discourse.git
synced 2024-12-11 21:44:08 +08:00
1492 lines
30 KiB
Ruby
1492 lines
30 KiB
Ruby
# frozen_string_literal: true
|
||
|
||
# Ruby port of http://chasen.org/~taku/software/TinySegmenter/tiny_segmenter-0.2.js
|
||
# This is essentially a trained machine learning model used to segment words in Japanese.
|
||
# Discourse core uses it for "best effort" segmentation of Japanese text for search.
|
||
class TinyJapaneseSegmenter
|
||
# Ordered [Regexp, label] pairs used to classify a single character. The
# kanji-numeral class ("M") is listed ahead of the general kanji class ("H")
# because every kanji numeral also falls inside the general kanji range.
#
# The half-width katakana and full-width Latin/digit ranges are written as
# \u escapes so that Unicode normalisation of this file cannot fold them into
# their ordinary counterparts (which would leave only redundant duplicate
# ranges and make those characters fall through to the "other" type).
CHARTYPE =
  {
    "[一二三四五六七八九十百千万億兆]" => "M", # kanji numerals
    "[一-龠々〆ヵヶ]" => "H", # kanji
    "[ぁ-ん]" => "I", # hiragana
    "[ァ-ヴー\uFF71-\uFF9D\uFF9E\uFF70]" => "K", # katakana, full- and half-width
    "[a-zA-Z\uFF41-\uFF5A\uFF21-\uFF3A]" => "A", # Latin letters, ASCII and full-width
    "[0-9\uFF10-\uFF19]" => "N", # digits, ASCII and full-width
  }.map { |pattern, value| [Regexp.compile(pattern), value] }.freeze
|
||
|
||
BIAS = -322
|
||
BC1 = { "HH" => 6, "II" => 2461, "KH" => 406, "OH" => -1378 }
|
||
BC2 = {
|
||
"AA" => -3267,
|
||
"AI" => 2744,
|
||
"AN" => -878,
|
||
"HH" => -4070,
|
||
"HM" => -1711,
|
||
"HN" => 4012,
|
||
"HO" => 3761,
|
||
"IA" => 1327,
|
||
"IH" => -1184,
|
||
"II" => -1332,
|
||
"IK" => 1721,
|
||
"IO" => 5492,
|
||
"KI" => 3831,
|
||
"KK" => -8741,
|
||
"MH" => -3132,
|
||
"MK" => 3334,
|
||
"OO" => -2920,
|
||
}
|
||
BC3 = {
|
||
"HH" => 996,
|
||
"HI" => 626,
|
||
"HK" => -721,
|
||
"HN" => -1307,
|
||
"HO" => -836,
|
||
"IH" => -301,
|
||
"KK" => 2762,
|
||
"MK" => 1079,
|
||
"MM" => 4034,
|
||
"OA" => -1652,
|
||
"OH" => 266,
|
||
}
|
||
BP1 = { "BB" => 295, "OB" => 304, "OO" => -125, "UB" => 352 }
|
||
BP2 = { "BO" => 60, "OO" => -1762 }
|
||
BQ1 = {
|
||
"BHH" => 1150,
|
||
"BHM" => 1521,
|
||
"BII" => -1158,
|
||
"BIM" => 886,
|
||
"BMH" => 1208,
|
||
"BNH" => 449,
|
||
"BOH" => -91,
|
||
"BOO" => -2597,
|
||
"OHI" => 451,
|
||
"OIH" => -296,
|
||
"OKA" => 1851,
|
||
"OKH" => -1020,
|
||
"OKK" => 904,
|
||
"OOO" => 2965,
|
||
}
|
||
BQ2 = {
|
||
"BHH" => 118,
|
||
"BHI" => -1159,
|
||
"BHM" => 466,
|
||
"BIH" => -919,
|
||
"BKK" => -1720,
|
||
"BKO" => 864,
|
||
"OHH" => -1139,
|
||
"OHM" => -181,
|
||
"OIH" => 153,
|
||
"UHI" => -1146,
|
||
}
|
||
BQ3 = {
|
||
"BHH" => -792,
|
||
"BHI" => 2664,
|
||
"BII" => -299,
|
||
"BKI" => 419,
|
||
"BMH" => 937,
|
||
"BMM" => 8335,
|
||
"BNN" => 998,
|
||
"BOH" => 775,
|
||
"OHH" => 2174,
|
||
"OHM" => 439,
|
||
"OII" => 280,
|
||
"OKH" => 1798,
|
||
"OKI" => -793,
|
||
"OKO" => -2242,
|
||
"OMH" => -2402,
|
||
"OOO" => 11_699,
|
||
}
|
||
BQ4 = {
|
||
"BHH" => -3895,
|
||
"BIH" => 3761,
|
||
"BII" => -4654,
|
||
"BIK" => 1348,
|
||
"BKK" => -1806,
|
||
"BMI" => -3385,
|
||
"BOO" => -12_396,
|
||
"OAH" => 926,
|
||
"OHH" => 266,
|
||
"OHK" => -2036,
|
||
"ONN" => -973,
|
||
}
|
||
BW1 = {
|
||
",と" => 660,
|
||
",同" => 727,
|
||
"B1あ" => 1404,
|
||
"B1同" => 542,
|
||
"、と" => 660,
|
||
"、同" => 727,
|
||
"」と" => 1682,
|
||
"あっ" => 1505,
|
||
"いう" => 1743,
|
||
"いっ" => -2055,
|
||
"いる" => 672,
|
||
"うし" => -4817,
|
||
"うん" => 665,
|
||
"から" => 3472,
|
||
"がら" => 600,
|
||
"こう" => -790,
|
||
"こと" => 2083,
|
||
"こん" => -1262,
|
||
"さら" => -4143,
|
||
"さん" => 4573,
|
||
"した" => 2641,
|
||
"して" => 1104,
|
||
"すで" => -3399,
|
||
"そこ" => 1977,
|
||
"それ" => -871,
|
||
"たち" => 1122,
|
||
"ため" => 601,
|
||
"った" => 3463,
|
||
"つい" => -802,
|
||
"てい" => 805,
|
||
"てき" => 1249,
|
||
"でき" => 1127,
|
||
"です" => 3445,
|
||
"では" => 844,
|
||
"とい" => -4915,
|
||
"とみ" => 1922,
|
||
"どこ" => 3887,
|
||
"ない" => 5713,
|
||
"なっ" => 3015,
|
||
"など" => 7379,
|
||
"なん" => -1113,
|
||
"にし" => 2468,
|
||
"には" => 1498,
|
||
"にも" => 1671,
|
||
"に対" => -912,
|
||
"の一" => -501,
|
||
"の中" => 741,
|
||
"ませ" => 2448,
|
||
"まで" => 1711,
|
||
"まま" => 2600,
|
||
"まる" => -2155,
|
||
"やむ" => -1947,
|
||
"よっ" => -2565,
|
||
"れた" => 2369,
|
||
"れで" => -913,
|
||
"をし" => 1860,
|
||
"を見" => 731,
|
||
"亡く" => -1886,
|
||
"京都" => 2558,
|
||
"取り" => -2784,
|
||
"大き" => -2604,
|
||
"大阪" => 1497,
|
||
"平方" => -2314,
|
||
"引き" => -1336,
|
||
"日本" => -195,
|
||
"本当" => -2423,
|
||
"毎日" => -2113,
|
||
"目指" => -724,
|
||
"B1あ" => 1404,
|
||
"B1同" => 542,
|
||
"」と" => 1682,
|
||
}
|
||
BW2 = {
|
||
".." => -11_822,
|
||
"11" => -669,
|
||
"――" => -5730,
|
||
"−−" => -13_175,
|
||
"いう" => -1609,
|
||
"うか" => 2490,
|
||
"かし" => -1350,
|
||
"かも" => -602,
|
||
"から" => -7194,
|
||
"かれ" => 4612,
|
||
"がい" => 853,
|
||
"がら" => -3198,
|
||
"きた" => 1941,
|
||
"くな" => -1597,
|
||
"こと" => -8392,
|
||
"この" => -4193,
|
||
"させ" => 4533,
|
||
"され" => 13_168,
|
||
"さん" => -3977,
|
||
"しい" => -1819,
|
||
"しか" => -545,
|
||
"した" => 5078,
|
||
"して" => 972,
|
||
"しな" => 939,
|
||
"その" => -3744,
|
||
"たい" => -1253,
|
||
"たた" => -662,
|
||
"ただ" => -3857,
|
||
"たち" => -786,
|
||
"たと" => 1224,
|
||
"たは" => -939,
|
||
"った" => 4589,
|
||
"って" => 1647,
|
||
"っと" => -2094,
|
||
"てい" => 6144,
|
||
"てき" => 3640,
|
||
"てく" => 2551,
|
||
"ては" => -3110,
|
||
"ても" => -3065,
|
||
"でい" => 2666,
|
||
"でき" => -1528,
|
||
"でし" => -3828,
|
||
"です" => -4761,
|
||
"でも" => -4203,
|
||
"とい" => 1890,
|
||
"とこ" => -1746,
|
||
"とと" => -2279,
|
||
"との" => 720,
|
||
"とみ" => 5168,
|
||
"とも" => -3941,
|
||
"ない" => -2488,
|
||
"なが" => -1313,
|
||
"など" => -6509,
|
||
"なの" => 2614,
|
||
"なん" => 3099,
|
||
"にお" => -1615,
|
||
"にし" => 2748,
|
||
"にな" => 2454,
|
||
"によ" => -7236,
|
||
"に対" => -14_943,
|
||
"に従" => -4688,
|
||
"に関" => -11_388,
|
||
"のか" => 2093,
|
||
"ので" => -7059,
|
||
"のに" => -6041,
|
||
"のの" => -6125,
|
||
"はい" => 1073,
|
||
"はが" => -1033,
|
||
"はず" => -2532,
|
||
"ばれ" => 1813,
|
||
"まし" => -1316,
|
||
"まで" => -6621,
|
||
"まれ" => 5409,
|
||
"めて" => -3153,
|
||
"もい" => 2230,
|
||
"もの" => -10_713,
|
||
"らか" => -944,
|
||
"らし" => -1611,
|
||
"らに" => -1897,
|
||
"りし" => 651,
|
||
"りま" => 1620,
|
||
"れた" => 4270,
|
||
"れて" => 849,
|
||
"れば" => 4114,
|
||
"ろう" => 6067,
|
||
"われ" => 7901,
|
||
"を通" => -11_877,
|
||
"んだ" => 728,
|
||
"んな" => -4115,
|
||
"一人" => 602,
|
||
"一方" => -1375,
|
||
"一日" => 970,
|
||
"一部" => -1051,
|
||
"上が" => -4479,
|
||
"会社" => -1116,
|
||
"出て" => 2163,
|
||
"分の" => -7758,
|
||
"同党" => 970,
|
||
"同日" => -913,
|
||
"大阪" => -2471,
|
||
"委員" => -1250,
|
||
"少な" => -1050,
|
||
"年度" => -8669,
|
||
"年間" => -1626,
|
||
"府県" => -2363,
|
||
"手権" => -1982,
|
||
"新聞" => -4066,
|
||
"日新" => -722,
|
||
"日本" => -7068,
|
||
"日米" => 3372,
|
||
"曜日" => -601,
|
||
"朝鮮" => -2355,
|
||
"本人" => -2697,
|
||
"東京" => -1543,
|
||
"然と" => -1384,
|
||
"社会" => -1276,
|
||
"立て" => -990,
|
||
"第に" => -1612,
|
||
"米国" => -4268,
|
||
"11" => -669,
|
||
}
|
||
BW3 = {
|
||
"あた" => -2194,
|
||
"あり" => 719,
|
||
"ある" => 3846,
|
||
"い." => -1185,
|
||
"い。" => -1185,
|
||
"いい" => 5308,
|
||
"いえ" => 2079,
|
||
"いく" => 3029,
|
||
"いた" => 2056,
|
||
"いっ" => 1883,
|
||
"いる" => 5600,
|
||
"いわ" => 1527,
|
||
"うち" => 1117,
|
||
"うと" => 4798,
|
||
"えと" => 1454,
|
||
"か." => 2857,
|
||
"か。" => 2857,
|
||
"かけ" => -743,
|
||
"かっ" => -4098,
|
||
"かに" => -669,
|
||
"から" => 6520,
|
||
"かり" => -2670,
|
||
"が," => 1816,
|
||
"が、" => 1816,
|
||
"がき" => -4855,
|
||
"がけ" => -1127,
|
||
"がっ" => -913,
|
||
"がら" => -4977,
|
||
"がり" => -2064,
|
||
"きた" => 1645,
|
||
"けど" => 1374,
|
||
"こと" => 7397,
|
||
"この" => 1542,
|
||
"ころ" => -2757,
|
||
"さい" => -714,
|
||
"さを" => 976,
|
||
"し," => 1557,
|
||
"し、" => 1557,
|
||
"しい" => -3714,
|
||
"した" => 3562,
|
||
"して" => 1449,
|
||
"しな" => 2608,
|
||
"しま" => 1200,
|
||
"す." => -1310,
|
||
"す。" => -1310,
|
||
"する" => 6521,
|
||
"ず," => 3426,
|
||
"ず、" => 3426,
|
||
"ずに" => 841,
|
||
"そう" => 428,
|
||
"た." => 8875,
|
||
"た。" => 8875,
|
||
"たい" => -594,
|
||
"たの" => 812,
|
||
"たり" => -1183,
|
||
"たる" => -853,
|
||
"だ." => 4098,
|
||
"だ。" => 4098,
|
||
"だっ" => 1004,
|
||
"った" => -4748,
|
||
"って" => 300,
|
||
"てい" => 6240,
|
||
"てお" => 855,
|
||
"ても" => 302,
|
||
"です" => 1437,
|
||
"でに" => -1482,
|
||
"では" => 2295,
|
||
"とう" => -1387,
|
||
"とし" => 2266,
|
||
"との" => 541,
|
||
"とも" => -3543,
|
||
"どう" => 4664,
|
||
"ない" => 1796,
|
||
"なく" => -903,
|
||
"など" => 2135,
|
||
"に," => -1021,
|
||
"に、" => -1021,
|
||
"にし" => 1771,
|
||
"にな" => 1906,
|
||
"には" => 2644,
|
||
"の," => -724,
|
||
"の、" => -724,
|
||
"の子" => -1000,
|
||
"は," => 1337,
|
||
"は、" => 1337,
|
||
"べき" => 2181,
|
||
"まし" => 1113,
|
||
"ます" => 6943,
|
||
"まっ" => -1549,
|
||
"まで" => 6154,
|
||
"まれ" => -793,
|
||
"らし" => 1479,
|
||
"られ" => 6820,
|
||
"るる" => 3818,
|
||
"れ," => 854,
|
||
"れ、" => 854,
|
||
"れた" => 1850,
|
||
"れて" => 1375,
|
||
"れば" => -3246,
|
||
"れる" => 1091,
|
||
"われ" => -605,
|
||
"んだ" => 606,
|
||
"んで" => 798,
|
||
"カ月" => 990,
|
||
"会議" => 860,
|
||
"入り" => 1232,
|
||
"大会" => 2217,
|
||
"始め" => 1681,
|
||
"市" => 965,
|
||
"新聞" => -5055,
|
||
"日," => 974,
|
||
"日、" => 974,
|
||
"社会" => 2024,
|
||
"カ月" => 990,
|
||
}
|
||
TC1 = {
|
||
"AAA" => 1093,
|
||
"HHH" => 1029,
|
||
"HHM" => 580,
|
||
"HII" => 998,
|
||
"HOH" => -390,
|
||
"HOM" => -331,
|
||
"IHI" => 1169,
|
||
"IOH" => -142,
|
||
"IOI" => -1015,
|
||
"IOM" => 467,
|
||
"MMH" => 187,
|
||
"OOI" => -1832,
|
||
}
|
||
TC2 = {
|
||
"HHO" => 2088,
|
||
"HII" => -1023,
|
||
"HMM" => -1154,
|
||
"IHI" => -1965,
|
||
"KKH" => 703,
|
||
"OII" => -2649,
|
||
}
|
||
TC3 = {
|
||
"AAA" => -294,
|
||
"HHH" => 346,
|
||
"HHI" => -341,
|
||
"HII" => -1088,
|
||
"HIK" => 731,
|
||
"HOH" => -1486,
|
||
"IHH" => 128,
|
||
"IHI" => -3041,
|
||
"IHO" => -1935,
|
||
"IIH" => -825,
|
||
"IIM" => -1035,
|
||
"IOI" => -542,
|
||
"KHH" => -1216,
|
||
"KKA" => 491,
|
||
"KKH" => -1217,
|
||
"KOK" => -1009,
|
||
"MHH" => -2694,
|
||
"MHM" => -457,
|
||
"MHO" => 123,
|
||
"MMH" => -471,
|
||
"NNH" => -1689,
|
||
"NNO" => 662,
|
||
"OHO" => -3393,
|
||
}
|
||
TC4 = {
|
||
"HHH" => -203,
|
||
"HHI" => 1344,
|
||
"HHK" => 365,
|
||
"HHM" => -122,
|
||
"HHN" => 182,
|
||
"HHO" => 669,
|
||
"HIH" => 804,
|
||
"HII" => 679,
|
||
"HOH" => 446,
|
||
"IHH" => 695,
|
||
"IHO" => -2324,
|
||
"IIH" => 321,
|
||
"III" => 1497,
|
||
"IIO" => 656,
|
||
"IOO" => 54,
|
||
"KAK" => 4845,
|
||
"KKA" => 3386,
|
||
"KKK" => 3065,
|
||
"MHH" => -405,
|
||
"MHI" => 201,
|
||
"MMH" => -241,
|
||
"MMM" => 661,
|
||
"MOM" => 841,
|
||
}
|
||
TQ1 = {
|
||
"BHHH" => -227,
|
||
"BHHI" => 316,
|
||
"BHIH" => -132,
|
||
"BIHH" => 60,
|
||
"BIII" => 1595,
|
||
"BNHH" => -744,
|
||
"BOHH" => 225,
|
||
"BOOO" => -908,
|
||
"OAKK" => 482,
|
||
"OHHH" => 281,
|
||
"OHIH" => 249,
|
||
"OIHI" => 200,
|
||
"OIIH" => -68,
|
||
}
|
||
TQ2 = { "BIHH" => -1401, "BIII" => -1033, "BKAK" => -543, "BOOO" => -5591 }
|
||
TQ3 = {
|
||
"BHHH" => 478,
|
||
"BHHM" => -1073,
|
||
"BHIH" => 222,
|
||
"BHII" => -504,
|
||
"BIIH" => -116,
|
||
"BIII" => -105,
|
||
"BMHI" => -863,
|
||
"BMHM" => -464,
|
||
"BOMH" => 620,
|
||
"OHHH" => 346,
|
||
"OHHI" => 1729,
|
||
"OHII" => 997,
|
||
"OHMH" => 481,
|
||
"OIHH" => 623,
|
||
"OIIH" => 1344,
|
||
"OKAK" => 2792,
|
||
"OKHH" => 587,
|
||
"OKKA" => 679,
|
||
"OOHH" => 110,
|
||
"OOII" => -685,
|
||
}
|
||
TQ4 = {
|
||
"BHHH" => -721,
|
||
"BHHM" => -3604,
|
||
"BHII" => -966,
|
||
"BIIH" => -607,
|
||
"BIII" => -2181,
|
||
"OAAA" => -2763,
|
||
"OAKK" => 180,
|
||
"OHHH" => -294,
|
||
"OHHI" => 2446,
|
||
"OHHO" => 480,
|
||
"OHIH" => -1573,
|
||
"OIHH" => 1935,
|
||
"OIHI" => -493,
|
||
"OIIH" => 626,
|
||
"OIII" => -4007,
|
||
"OKAK" => -8156,
|
||
}
|
||
TW1 = { "につい" => -4681, "東京都" => 2026 }
|
||
TW2 = {
|
||
"ある程" => -2049,
|
||
"いった" => -1256,
|
||
"ころが" => -2434,
|
||
"しょう" => 3873,
|
||
"その後" => -4430,
|
||
"だって" => -1049,
|
||
"ていた" => 1833,
|
||
"として" => -4657,
|
||
"ともに" => -4517,
|
||
"もので" => 1882,
|
||
"一気に" => -792,
|
||
"初めて" => -1512,
|
||
"同時に" => -8097,
|
||
"大きな" => -1255,
|
||
"対して" => -2721,
|
||
"社会党" => -3216,
|
||
}
|
||
TW3 = {
|
||
"いただ" => -1734,
|
||
"してい" => 1314,
|
||
"として" => -4314,
|
||
"につい" => -5483,
|
||
"にとっ" => -5989,
|
||
"に当た" => -6247,
|
||
"ので," => -727,
|
||
"ので、" => -727,
|
||
"のもの" => -600,
|
||
"れから" => -3752,
|
||
"十二月" => -2287,
|
||
}
|
||
TW4 = {
|
||
"いう." => 8576,
|
||
"いう。" => 8576,
|
||
"からな" => -2348,
|
||
"してい" => 2958,
|
||
"たが," => 1516,
|
||
"たが、" => 1516,
|
||
"ている" => 1538,
|
||
"という" => 1349,
|
||
"ました" => 5543,
|
||
"ません" => 1097,
|
||
"ようと" => -4258,
|
||
"よると" => 5865,
|
||
}
|
||
UC1 = { "A" => 484, "K" => 93, "M" => 645, "O" => -505 }
|
||
UC2 = { "A" => 819, "H" => 1059, "I" => 409, "M" => 3987, "N" => 5775, "O" => 646 }
|
||
UC3 = { "A" => -1370, "I" => 2311 }
|
||
UC4 = {
|
||
"A" => -2643,
|
||
"H" => 1809,
|
||
"I" => -1032,
|
||
"K" => -3450,
|
||
"M" => 3565,
|
||
"N" => 3876,
|
||
"O" => 6646,
|
||
}
|
||
UC5 = { "H" => 313, "I" => -1238, "K" => -799, "M" => 539, "O" => -831 }
|
||
UC6 = { "H" => -506, "I" => -253, "K" => 87, "M" => 247, "O" => -387 }
|
||
UP1 = { "O" => -214 }
|
||
UP2 = { "B" => 69, "O" => 935 }
|
||
UP3 = { "B" => 189 }
|
||
UQ1 = {
|
||
"BH" => 21,
|
||
"BI" => -12,
|
||
"BK" => -99,
|
||
"BN" => 142,
|
||
"BO" => -56,
|
||
"OH" => -95,
|
||
"OI" => 477,
|
||
"OK" => 410,
|
||
"OO" => -2422,
|
||
}
|
||
UQ2 = { "BH" => 216, "BI" => 113, "OK" => 1759 }
|
||
UQ3 = {
|
||
"BA" => -479,
|
||
"BH" => 42,
|
||
"BI" => 1913,
|
||
"BK" => -7198,
|
||
"BM" => 3160,
|
||
"BN" => 6427,
|
||
"BO" => 14_761,
|
||
"OI" => -827,
|
||
"ON" => -3212,
|
||
}
|
||
UW1 = {
|
||
"," => 156,
|
||
"、" => 156,
|
||
"「" => -463,
|
||
"あ" => -941,
|
||
"う" => -127,
|
||
"が" => -553,
|
||
"き" => 121,
|
||
"こ" => 505,
|
||
"で" => -201,
|
||
"と" => -547,
|
||
"ど" => -123,
|
||
"に" => -789,
|
||
"の" => -185,
|
||
"は" => -847,
|
||
"も" => -466,
|
||
"や" => -470,
|
||
"よ" => 182,
|
||
"ら" => -292,
|
||
"り" => 208,
|
||
"れ" => 169,
|
||
"を" => -446,
|
||
"ん" => -137,
|
||
"・" => -135,
|
||
"主" => -402,
|
||
"京" => -268,
|
||
"区" => -912,
|
||
"午" => 871,
|
||
"国" => -460,
|
||
"大" => 561,
|
||
"委" => 729,
|
||
"市" => -411,
|
||
"日" => -141,
|
||
"理" => 361,
|
||
"生" => -408,
|
||
"県" => -386,
|
||
"都" => -718,
|
||
"「" => -463,
|
||
"・" => -135,
|
||
}
|
||
UW2 = {
|
||
"," => -829,
|
||
"、" => -829,
|
||
"〇" => 892,
|
||
"「" => -645,
|
||
"」" => 3145,
|
||
"あ" => -538,
|
||
"い" => 505,
|
||
"う" => 134,
|
||
"お" => -502,
|
||
"か" => 1454,
|
||
"が" => -856,
|
||
"く" => -412,
|
||
"こ" => 1141,
|
||
"さ" => 878,
|
||
"ざ" => 540,
|
||
"し" => 1529,
|
||
"す" => -675,
|
||
"せ" => 300,
|
||
"そ" => -1011,
|
||
"た" => 188,
|
||
"だ" => 1837,
|
||
"つ" => -949,
|
||
"て" => -291,
|
||
"で" => -268,
|
||
"と" => -981,
|
||
"ど" => 1273,
|
||
"な" => 1063,
|
||
"に" => -1764,
|
||
"の" => 130,
|
||
"は" => -409,
|
||
"ひ" => -1273,
|
||
"べ" => 1261,
|
||
"ま" => 600,
|
||
"も" => -1263,
|
||
"や" => -402,
|
||
"よ" => 1639,
|
||
"り" => -579,
|
||
"る" => -694,
|
||
"れ" => 571,
|
||
"を" => -2516,
|
||
"ん" => 2095,
|
||
"ア" => -587,
|
||
"カ" => 306,
|
||
"キ" => 568,
|
||
"ッ" => 831,
|
||
"三" => -758,
|
||
"不" => -2150,
|
||
"世" => -302,
|
||
"中" => -968,
|
||
"主" => -861,
|
||
"事" => 492,
|
||
"人" => -123,
|
||
"会" => 978,
|
||
"保" => 362,
|
||
"入" => 548,
|
||
"初" => -3025,
|
||
"副" => -1566,
|
||
"北" => -3414,
|
||
"区" => -422,
|
||
"大" => -1769,
|
||
"天" => -865,
|
||
"太" => -483,
|
||
"子" => -1519,
|
||
"学" => 760,
|
||
"実" => 1023,
|
||
"小" => -2009,
|
||
"市" => -813,
|
||
"年" => -1060,
|
||
"強" => 1067,
|
||
"手" => -1519,
|
||
"揺" => -1033,
|
||
"政" => 1522,
|
||
"文" => -1355,
|
||
"新" => -1682,
|
||
"日" => -1815,
|
||
"明" => -1462,
|
||
"最" => -630,
|
||
"朝" => -1843,
|
||
"本" => -1650,
|
||
"東" => -931,
|
||
"果" => -665,
|
||
"次" => -2378,
|
||
"民" => -180,
|
||
"気" => -1740,
|
||
"理" => 752,
|
||
"発" => 529,
|
||
"目" => -1584,
|
||
"相" => -242,
|
||
"県" => -1165,
|
||
"立" => -763,
|
||
"第" => 810,
|
||
"米" => 509,
|
||
"自" => -1353,
|
||
"行" => 838,
|
||
"西" => -744,
|
||
"見" => -3874,
|
||
"調" => 1010,
|
||
"議" => 1198,
|
||
"込" => 3041,
|
||
"開" => 1758,
|
||
"間" => -1257,
|
||
"「" => -645,
|
||
"」" => 3145,
|
||
"ッ" => 831,
|
||
"ア" => -587,
|
||
"カ" => 306,
|
||
"キ" => 568,
|
||
}
|
||
UW3 = {
|
||
"," => 4889,
|
||
"1" => -800,
|
||
"−" => -1723,
|
||
"、" => 4889,
|
||
"々" => -2311,
|
||
"〇" => 5827,
|
||
"」" => 2670,
|
||
"〓" => -3573,
|
||
"あ" => -2696,
|
||
"い" => 1006,
|
||
"う" => 2342,
|
||
"え" => 1983,
|
||
"お" => -4864,
|
||
"か" => -1163,
|
||
"が" => 3271,
|
||
"く" => 1004,
|
||
"け" => 388,
|
||
"げ" => 401,
|
||
"こ" => -3552,
|
||
"ご" => -3116,
|
||
"さ" => -1058,
|
||
"し" => -395,
|
||
"す" => 584,
|
||
"せ" => 3685,
|
||
"そ" => -5228,
|
||
"た" => 842,
|
||
"ち" => -521,
|
||
"っ" => -1444,
|
||
"つ" => -1081,
|
||
"て" => 6167,
|
||
"で" => 2318,
|
||
"と" => 1691,
|
||
"ど" => -899,
|
||
"な" => -2788,
|
||
"に" => 2745,
|
||
"の" => 4056,
|
||
"は" => 4555,
|
||
"ひ" => -2171,
|
||
"ふ" => -1798,
|
||
"へ" => 1199,
|
||
"ほ" => -5516,
|
||
"ま" => -4384,
|
||
"み" => -120,
|
||
"め" => 1205,
|
||
"も" => 2323,
|
||
"や" => -788,
|
||
"よ" => -202,
|
||
"ら" => 727,
|
||
"り" => 649,
|
||
"る" => 5905,
|
||
"れ" => 2773,
|
||
"わ" => -1207,
|
||
"を" => 6620,
|
||
"ん" => -518,
|
||
"ア" => 551,
|
||
"グ" => 1319,
|
||
"ス" => 874,
|
||
"ッ" => -1350,
|
||
"ト" => 521,
|
||
"ム" => 1109,
|
||
"ル" => 1591,
|
||
"ロ" => 2201,
|
||
"ン" => 278,
|
||
"・" => -3794,
|
||
"一" => -1619,
|
||
"下" => -1759,
|
||
"世" => -2087,
|
||
"両" => 3815,
|
||
"中" => 653,
|
||
"主" => -758,
|
||
"予" => -1193,
|
||
"二" => 974,
|
||
"人" => 2742,
|
||
"今" => 792,
|
||
"他" => 1889,
|
||
"以" => -1368,
|
||
"低" => 811,
|
||
"何" => 4265,
|
||
"作" => -361,
|
||
"保" => -2439,
|
||
"元" => 4858,
|
||
"党" => 3593,
|
||
"全" => 1574,
|
||
"公" => -3030,
|
||
"六" => 755,
|
||
"共" => -1880,
|
||
"円" => 5807,
|
||
"再" => 3095,
|
||
"分" => 457,
|
||
"初" => 2475,
|
||
"別" => 1129,
|
||
"前" => 2286,
|
||
"副" => 4437,
|
||
"力" => 365,
|
||
"動" => -949,
|
||
"務" => -1872,
|
||
"化" => 1327,
|
||
"北" => -1038,
|
||
"区" => 4646,
|
||
"千" => -2309,
|
||
"午" => -783,
|
||
"協" => -1006,
|
||
"口" => 483,
|
||
"右" => 1233,
|
||
"各" => 3588,
|
||
"合" => -241,
|
||
"同" => 3906,
|
||
"和" => -837,
|
||
"員" => 4513,
|
||
"国" => 642,
|
||
"型" => 1389,
|
||
"場" => 1219,
|
||
"外" => -241,
|
||
"妻" => 2016,
|
||
"学" => -1356,
|
||
"安" => -423,
|
||
"実" => -1008,
|
||
"家" => 1078,
|
||
"小" => -513,
|
||
"少" => -3102,
|
||
"州" => 1155,
|
||
"市" => 3197,
|
||
"平" => -1804,
|
||
"年" => 2416,
|
||
"広" => -1030,
|
||
"府" => 1605,
|
||
"度" => 1452,
|
||
"建" => -2352,
|
||
"当" => -3885,
|
||
"得" => 1905,
|
||
"思" => -1291,
|
||
"性" => 1822,
|
||
"戸" => -488,
|
||
"指" => -3973,
|
||
"政" => -2013,
|
||
"教" => -1479,
|
||
"数" => 3222,
|
||
"文" => -1489,
|
||
"新" => 1764,
|
||
"日" => 2099,
|
||
"旧" => 5792,
|
||
"昨" => -661,
|
||
"時" => -1248,
|
||
"曜" => -951,
|
||
"最" => -937,
|
||
"月" => 4125,
|
||
"期" => 360,
|
||
"李" => 3094,
|
||
"村" => 364,
|
||
"東" => -805,
|
||
"核" => 5156,
|
||
"森" => 2438,
|
||
"業" => 484,
|
||
"氏" => 2613,
|
||
"民" => -1694,
|
||
"決" => -1073,
|
||
"法" => 1868,
|
||
"海" => -495,
|
||
"無" => 979,
|
||
"物" => 461,
|
||
"特" => -3850,
|
||
"生" => -273,
|
||
"用" => 914,
|
||
"町" => 1215,
|
||
"的" => 7313,
|
||
"直" => -1835,
|
||
"省" => 792,
|
||
"県" => 6293,
|
||
"知" => -1528,
|
||
"私" => 4231,
|
||
"税" => 401,
|
||
"立" => -960,
|
||
"第" => 1201,
|
||
"米" => 7767,
|
||
"系" => 3066,
|
||
"約" => 3663,
|
||
"級" => 1384,
|
||
"統" => -4229,
|
||
"総" => 1163,
|
||
"線" => 1255,
|
||
"者" => 6457,
|
||
"能" => 725,
|
||
"自" => -2869,
|
||
"英" => 785,
|
||
"見" => 1044,
|
||
"調" => -562,
|
||
"財" => -733,
|
||
"費" => 1777,
|
||
"車" => 1835,
|
||
"軍" => 1375,
|
||
"込" => -1504,
|
||
"通" => -1136,
|
||
"選" => -681,
|
||
"郎" => 1026,
|
||
"郡" => 4404,
|
||
"部" => 1200,
|
||
"金" => 2163,
|
||
"長" => 421,
|
||
"開" => -1432,
|
||
"間" => 1302,
|
||
"関" => -1282,
|
||
"雨" => 2009,
|
||
"電" => -1045,
|
||
"非" => 2066,
|
||
"駅" => 1620,
|
||
"1" => -800,
|
||
"」" => 2670,
|
||
"・" => -3794,
|
||
"ッ" => -1350,
|
||
"ア" => 551,
|
||
"グ" => 1319,
|
||
"ス" => 874,
|
||
"ト" => 521,
|
||
"ム" => 1109,
|
||
"ル" => 1591,
|
||
"ロ" => 2201,
|
||
"ン" => 278,
|
||
}
|
||
UW4 = {
|
||
"," => 3930,
|
||
"." => 3508,
|
||
"―" => -4841,
|
||
"、" => 3930,
|
||
"。" => 3508,
|
||
"〇" => 4999,
|
||
"「" => 1895,
|
||
"」" => 3798,
|
||
"〓" => -5156,
|
||
"あ" => 4752,
|
||
"い" => -3435,
|
||
"う" => -640,
|
||
"え" => -2514,
|
||
"お" => 2405,
|
||
"か" => 530,
|
||
"が" => 6006,
|
||
"き" => -4482,
|
||
"ぎ" => -3821,
|
||
"く" => -3788,
|
||
"け" => -4376,
|
||
"げ" => -4734,
|
||
"こ" => 2255,
|
||
"ご" => 1979,
|
||
"さ" => 2864,
|
||
"し" => -843,
|
||
"じ" => -2506,
|
||
"す" => -731,
|
||
"ず" => 1251,
|
||
"せ" => 181,
|
||
"そ" => 4091,
|
||
"た" => 5034,
|
||
"だ" => 5408,
|
||
"ち" => -3654,
|
||
"っ" => -5882,
|
||
"つ" => -1659,
|
||
"て" => 3994,
|
||
"で" => 7410,
|
||
"と" => 4547,
|
||
"な" => 5433,
|
||
"に" => 6499,
|
||
"ぬ" => 1853,
|
||
"ね" => 1413,
|
||
"の" => 7396,
|
||
"は" => 8578,
|
||
"ば" => 1940,
|
||
"ひ" => 4249,
|
||
"び" => -4134,
|
||
"ふ" => 1345,
|
||
"へ" => 6665,
|
||
"べ" => -744,
|
||
"ほ" => 1464,
|
||
"ま" => 1051,
|
||
"み" => -2082,
|
||
"む" => -882,
|
||
"め" => -5046,
|
||
"も" => 4169,
|
||
"ゃ" => -2666,
|
||
"や" => 2795,
|
||
"ょ" => -1544,
|
||
"よ" => 3351,
|
||
"ら" => -2922,
|
||
"り" => -9726,
|
||
"る" => -14_896,
|
||
"れ" => -2613,
|
||
"ろ" => -4570,
|
||
"わ" => -1783,
|
||
"を" => 13_150,
|
||
"ん" => -2352,
|
||
"カ" => 2145,
|
||
"コ" => 1789,
|
||
"セ" => 1287,
|
||
"ッ" => -724,
|
||
"ト" => -403,
|
||
"メ" => -1635,
|
||
"ラ" => -881,
|
||
"リ" => -541,
|
||
"ル" => -856,
|
||
"ン" => -3637,
|
||
"・" => -4371,
|
||
"ー" => -11_870,
|
||
"一" => -2069,
|
||
"中" => 2210,
|
||
"予" => 782,
|
||
"事" => -190,
|
||
"井" => -1768,
|
||
"人" => 1036,
|
||
"以" => 544,
|
||
"会" => 950,
|
||
"体" => -1286,
|
||
"作" => 530,
|
||
"側" => 4292,
|
||
"先" => 601,
|
||
"党" => -2006,
|
||
"共" => -1212,
|
||
"内" => 584,
|
||
"円" => 788,
|
||
"初" => 1347,
|
||
"前" => 1623,
|
||
"副" => 3879,
|
||
"力" => -302,
|
||
"動" => -740,
|
||
"務" => -2715,
|
||
"化" => 776,
|
||
"区" => 4517,
|
||
"協" => 1013,
|
||
"参" => 1555,
|
||
"合" => -1834,
|
||
"和" => -681,
|
||
"員" => -910,
|
||
"器" => -851,
|
||
"回" => 1500,
|
||
"国" => -619,
|
||
"園" => -1200,
|
||
"地" => 866,
|
||
"場" => -1410,
|
||
"塁" => -2094,
|
||
"士" => -1413,
|
||
"多" => 1067,
|
||
"大" => 571,
|
||
"子" => -4802,
|
||
"学" => -1397,
|
||
"定" => -1057,
|
||
"寺" => -809,
|
||
"小" => 1910,
|
||
"屋" => -1328,
|
||
"山" => -1500,
|
||
"島" => -2056,
|
||
"川" => -2667,
|
||
"市" => 2771,
|
||
"年" => 374,
|
||
"庁" => -4556,
|
||
"後" => 456,
|
||
"性" => 553,
|
||
"感" => 916,
|
||
"所" => -1566,
|
||
"支" => 856,
|
||
"改" => 787,
|
||
"政" => 2182,
|
||
"教" => 704,
|
||
"文" => 522,
|
||
"方" => -856,
|
||
"日" => 1798,
|
||
"時" => 1829,
|
||
"最" => 845,
|
||
"月" => -9066,
|
||
"木" => -485,
|
||
"来" => -442,
|
||
"校" => -360,
|
||
"業" => -1043,
|
||
"氏" => 5388,
|
||
"民" => -2716,
|
||
"気" => -910,
|
||
"沢" => -939,
|
||
"済" => -543,
|
||
"物" => -735,
|
||
"率" => 672,
|
||
"球" => -1267,
|
||
"生" => -1286,
|
||
"産" => -1101,
|
||
"田" => -2900,
|
||
"町" => 1826,
|
||
"的" => 2586,
|
||
"目" => 922,
|
||
"省" => -3485,
|
||
"県" => 2997,
|
||
"空" => -867,
|
||
"立" => -2112,
|
||
"第" => 788,
|
||
"米" => 2937,
|
||
"系" => 786,
|
||
"約" => 2171,
|
||
"経" => 1146,
|
||
"統" => -1169,
|
||
"総" => 940,
|
||
"線" => -994,
|
||
"署" => 749,
|
||
"者" => 2145,
|
||
"能" => -730,
|
||
"般" => -852,
|
||
"行" => -792,
|
||
"規" => 792,
|
||
"警" => -1184,
|
||
"議" => -244,
|
||
"谷" => -1000,
|
||
"賞" => 730,
|
||
"車" => -1481,
|
||
"軍" => 1158,
|
||
"輪" => -1433,
|
||
"込" => -3370,
|
||
"近" => 929,
|
||
"道" => -1291,
|
||
"選" => 2596,
|
||
"郎" => -4866,
|
||
"都" => 1192,
|
||
"野" => -1100,
|
||
"銀" => -2213,
|
||
"長" => 357,
|
||
"間" => -2344,
|
||
"院" => -2297,
|
||
"際" => -2604,
|
||
"電" => -878,
|
||
"領" => -1659,
|
||
"題" => -792,
|
||
"館" => -1984,
|
||
"首" => 1749,
|
||
"高" => 2120,
|
||
"「" => 1895,
|
||
"」" => 3798,
|
||
"・" => -4371,
|
||
"ッ" => -724,
|
||
"ー" => -11_870,
|
||
"カ" => 2145,
|
||
"コ" => 1789,
|
||
"セ" => 1287,
|
||
"ト" => -403,
|
||
"メ" => -1635,
|
||
"ラ" => -881,
|
||
"リ" => -541,
|
||
"ル" => -856,
|
||
"ン" => -3637,
|
||
}
|
||
UW5 = {
|
||
"," => 465,
|
||
"." => -299,
|
||
"1" => -514,
|
||
"E2" => -32_768,
|
||
"]" => -2762,
|
||
"、" => 465,
|
||
"。" => -299,
|
||
"「" => 363,
|
||
"あ" => 1655,
|
||
"い" => 331,
|
||
"う" => -503,
|
||
"え" => 1199,
|
||
"お" => 527,
|
||
"か" => 647,
|
||
"が" => -421,
|
||
"き" => 1624,
|
||
"ぎ" => 1971,
|
||
"く" => 312,
|
||
"げ" => -983,
|
||
"さ" => -1537,
|
||
"し" => -1371,
|
||
"す" => -852,
|
||
"だ" => -1186,
|
||
"ち" => 1093,
|
||
"っ" => 52,
|
||
"つ" => 921,
|
||
"て" => -18,
|
||
"で" => -850,
|
||
"と" => -127,
|
||
"ど" => 1682,
|
||
"な" => -787,
|
||
"に" => -1224,
|
||
"の" => -635,
|
||
"は" => -578,
|
||
"べ" => 1001,
|
||
"み" => 502,
|
||
"め" => 865,
|
||
"ゃ" => 3350,
|
||
"ょ" => 854,
|
||
"り" => -208,
|
||
"る" => 429,
|
||
"れ" => 504,
|
||
"わ" => 419,
|
||
"を" => -1264,
|
||
"ん" => 327,
|
||
"イ" => 241,
|
||
"ル" => 451,
|
||
"ン" => -343,
|
||
"中" => -871,
|
||
"京" => 722,
|
||
"会" => -1153,
|
||
"党" => -654,
|
||
"務" => 3519,
|
||
"区" => -901,
|
||
"告" => 848,
|
||
"員" => 2104,
|
||
"大" => -1296,
|
||
"学" => -548,
|
||
"定" => 1785,
|
||
"嵐" => -1304,
|
||
"市" => -2991,
|
||
"席" => 921,
|
||
"年" => 1763,
|
||
"思" => 872,
|
||
"所" => -814,
|
||
"挙" => 1618,
|
||
"新" => -1682,
|
||
"日" => 218,
|
||
"月" => -4353,
|
||
"査" => 932,
|
||
"格" => 1356,
|
||
"機" => -1508,
|
||
"氏" => -1347,
|
||
"田" => 240,
|
||
"町" => -3912,
|
||
"的" => -3149,
|
||
"相" => 1319,
|
||
"省" => -1052,
|
||
"県" => -4003,
|
||
"研" => -997,
|
||
"社" => -278,
|
||
"空" => -813,
|
||
"統" => 1955,
|
||
"者" => -2233,
|
||
"表" => 663,
|
||
"語" => -1073,
|
||
"議" => 1219,
|
||
"選" => -1018,
|
||
"郎" => -368,
|
||
"長" => 786,
|
||
"間" => 1191,
|
||
"題" => 2368,
|
||
"館" => -689,
|
||
"1" => -514,
|
||
"E2" => -32_768,
|
||
"「" => 363,
|
||
"イ" => 241,
|
||
"ル" => 451,
|
||
"ン" => -343,
|
||
}
|
||
UW6 = {
|
||
"," => 227,
|
||
"." => 808,
|
||
"1" => -270,
|
||
"E1" => 306,
|
||
"、" => 227,
|
||
"。" => 808,
|
||
"あ" => -307,
|
||
"う" => 189,
|
||
"か" => 241,
|
||
"が" => -73,
|
||
"く" => -121,
|
||
"こ" => -200,
|
||
"じ" => 1782,
|
||
"す" => 383,
|
||
"た" => -428,
|
||
"っ" => 573,
|
||
"て" => -1014,
|
||
"で" => 101,
|
||
"と" => -105,
|
||
"な" => -253,
|
||
"に" => -149,
|
||
"の" => -417,
|
||
"は" => -236,
|
||
"も" => -206,
|
||
"り" => 187,
|
||
"る" => -135,
|
||
"を" => 195,
|
||
"ル" => -673,
|
||
"ン" => -496,
|
||
"一" => -277,
|
||
"中" => 201,
|
||
"件" => -800,
|
||
"会" => 624,
|
||
"前" => 302,
|
||
"区" => 1792,
|
||
"員" => -1212,
|
||
"委" => 798,
|
||
"学" => -960,
|
||
"市" => 887,
|
||
"広" => -695,
|
||
"後" => 535,
|
||
"業" => -697,
|
||
"相" => 753,
|
||
"社" => -507,
|
||
"福" => 974,
|
||
"空" => -822,
|
||
"者" => 1811,
|
||
"連" => 463,
|
||
"郎" => 1082,
|
||
"1" => -270,
|
||
"E1" => 306,
|
||
"ル" => -673,
|
||
"ン" => -496,
|
||
}
|
||
|
||
class << self
|
||
# Splits +text+ into an array of word segments using the weight tables
# defined in this class.
#
# Every position between two characters is scored by summing BIAS and the
# weights found for the surrounding characters (w1..w6), their character
# types (c1..c6, as returned by +ctype+) and the three most recent boundary
# decisions (p1..p3). A positive score starts a new segment.
#
# @param text [String, nil] the text to segment
# @return [Array<String>] the segments in order; joined together they
#   reproduce +text+. Returns [] when +text+ is nil or whitespace only.
def segment(text)
  return [] if text.nil? || text.strip.empty?

  chars = text.chars

  # Three sentinel entries on each side let the six-wide window below be
  # read at every character without bounds checks.
  segments = %w[B3 B2 B1] + chars + %w[E1 E2 E3]
  ctypes = %w[O O O] + chars.map { |char| ctype(char) } + %w[O O O]

  result = []
  # Own copy, so appending to the word never alters an entry of +segments+
  # that later windows still read as w1..w3.
  word = segments[3].dup
  # Boundary history: "U" until a decision exists, then "B" (a segment
  # started here) or "O" (it did not).
  p1 = p2 = p3 = "U"

  4.upto(segments.size - 4) do |i|
    w1, w2, w3, w4, w5, w6 = segments[i - 3, 6]
    c1, c2, c3, c4, c5, c6 = ctypes[i - 3, 6]

    score = BIAS

    # Previous boundary decisions.
    score += UP1.fetch(p1, 0)
    score += UP2.fetch(p2, 0)
    score += UP3.fetch(p3, 0)
    score += BP1.fetch(p1 + p2, 0)
    score += BP2.fetch(p2 + p3, 0)

    # Character unigrams, bigrams and trigrams.
    score += UW1.fetch(w1, 0)
    score += UW2.fetch(w2, 0)
    score += UW3.fetch(w3, 0)
    score += UW4.fetch(w4, 0)
    score += UW5.fetch(w5, 0)
    score += UW6.fetch(w6, 0)
    score += BW1.fetch(w2 + w3, 0)
    score += BW2.fetch(w3 + w4, 0)
    score += BW3.fetch(w4 + w5, 0)
    score += TW1.fetch(w1 + w2 + w3, 0)
    score += TW2.fetch(w2 + w3 + w4, 0)
    score += TW3.fetch(w3 + w4 + w5, 0)
    score += TW4.fetch(w4 + w5 + w6, 0)

    # Character-type unigrams, bigrams and trigrams.
    score += UC1.fetch(c1, 0)
    score += UC2.fetch(c2, 0)
    score += UC3.fetch(c3, 0)
    score += UC4.fetch(c4, 0)
    score += UC5.fetch(c5, 0)
    score += UC6.fetch(c6, 0)
    score += BC1.fetch(c2 + c3, 0)
    score += BC2.fetch(c3 + c4, 0)
    score += BC3.fetch(c4 + c5, 0)
    score += TC1.fetch(c1 + c2 + c3, 0)
    score += TC2.fetch(c2 + c3 + c4, 0)
    score += TC3.fetch(c3 + c4 + c5, 0)
    score += TC4.fetch(c4 + c5 + c6, 0)
    # A TC5 term (keyed on c4 + c5 + c6) was already disabled here; no TC5
    # table is defined in this class.

    # Boundary history combined with character types.
    score += UQ1.fetch(p1 + c1, 0)
    score += UQ2.fetch(p2 + c2, 0)
    score += UQ3.fetch(p3 + c3, 0)
    score += BQ1.fetch(p2 + c2 + c3, 0)
    score += BQ2.fetch(p2 + c3 + c4, 0)
    score += BQ3.fetch(p3 + c2 + c3, 0)
    score += BQ4.fetch(p3 + c3 + c4, 0)
    score += TQ1.fetch(p2 + c1 + c2 + c3, 0)
    score += TQ2.fetch(p2 + c2 + c3 + c4, 0)
    score += TQ3.fetch(p3 + c1 + c2 + c3, 0)
    score += TQ4.fetch(p3 + c2 + c3 + c4, 0)

    if score > 0
      # Boundary before the current character: close the running word.
      result << word
      word = +""
      boundary = "B"
    else
      boundary = "O"
    end

    p1, p2, p3 = p2, p3, boundary
    word << w4
  end

  result << word
end
|
||
|
||
private
|
||
|
||
# Classifies a single character for the segmentation model.
#
# @param text [String] the character to classify
# @return [String] the label paired with the first CHARTYPE pattern that
#   matches +text+, or "O" when none of them match
def ctype(text)
  # match? only answers yes/no, so no MatchData is allocated per probe.
  _pattern, label = CHARTYPE.find { |pattern, _| text.match?(pattern) }
  label || "O"
end
|
||
end
|
||
end
|