kana.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  /**
   * This module handles romaji input and matches it against the provided kana
   * input. While most kana map one-to-one to romaji, some kana can be typed in
   * more than one way. In addition, we also have to handle っ, which causes the
   * next consonant to be repeated.
   *
   * State management is done with one state machine per kana, and each machine
   * should handle every romaji variation that can be typed for that kana.
   * Additionally, it keeps track of what is left to be typed, and adjusts
   * itself accordingly if an alternative romaji spelling was used.
   *
   * One of the key considerations is handling っ. It doesn't have a spelling in
   * and of itself; it just modifies the state machine that comes after it.
   * Intermediate states need to be created, and care should be taken over what
   * shows up in the display.
   */
  17. import * as state from './state';
  18. import {
  19. State,
  20. StateMachine,
  21. makeTransition as t,
  22. } from './state';
  /**
   * Builds a machine that accepts `source` exactly, one character at a time.
   * The state reached before typing character `i` is labelled with the
   * still-untyped suffix `source.substring(i)`.
   *
   * @param source romaji spelling to accept
   * @param extraBoundaries indices of characters whose transition is flagged
   *   as a boundary, in addition to the last character (always flagged)
   * @returns the machine built from the per-character transitions
   */
  function literal(source: string, ...extraBoundaries: number[]): StateMachine {
    const lastIndex = source.length - 1;
    const transitions: state.Transition[] = source.split('').map((ch, idx) => {
      const isBoundary = idx === lastIndex || extraBoundaries.indexOf(idx) !== -1;
      return t(source.substring(idx), ch, source.substring(idx + 1), isBoundary);
    });
    return state.buildFromTransitions(source, transitions);
  }
  /** Machine for し: accepts "shi" as well as the shorter "si". */
  function shi(): StateMachine {
    const transitions = [
      t('shi', 's', 'hi'),
      t('hi', 'h', 'i'),
      // "si": the 'h' may be skipped entirely
      t('hi', 'i', '', true),
      t('i', 'i', '', true),
    ];
    return state.buildFromTransitions('shi', transitions);
  }
  /** Machine for ち: accepts "chi" as well as "ti". */
  function chi(): StateMachine {
    const transitions = [
      t('chi', 'c', 'hi'),
      // "ti": a leading 't' jumps straight to the final vowel
      t('chi', 't', 'i'),
      t('hi', 'h', 'i'),
      t('i', 'i', '', true),
    ];
    return state.buildFromTransitions('chi', transitions);
  }
  /** Machine for つ: accepts "tsu" as well as the shorter "tu". */
  function tsu(): StateMachine {
    const transitions = [
      t('tsu', 't', 'su'),
      t('su', 's', 'u'),
      // "tu": the 's' may be skipped entirely
      t('su', 'u', '', true),
      t('u', 'u', '', true),
    ];
    return state.buildFromTransitions('tsu', transitions);
  }
  /** Machine for ふ: accepts "fu" as well as "hu". */
  function fu(): StateMachine {
    const transitions = [
      t('fu', 'f', 'u'),
      t('fu', 'h', 'u'),
      t('u', 'u', '', true),
    ];
    return state.buildFromTransitions('fu', transitions);
  }
  /** Machine for じ: accepts "ji" as well as "zi". */
  function ji(): StateMachine {
    const transitions = [
      t('ji', 'j', 'i'),
      t('ji', 'z', 'i'),
      t('i', 'i', '', true),
    ];
    return state.buildFromTransitions('ji', transitions);
  }
  /**
   * Machine for the しゃ/しゅ/しょ family: accepts "sh" + `end` as well as
   * "sy" + `end`.
   *
   * @param end the final vowel ('a', 'u' or 'o' at the call sites in this file)
   */
  function sh(end: string): StateMachine {
    const afterS = `h${end}`;
    const full = `s${afterS}`;
    const transitions = [
      t(full, 's', afterS, true),
      // the second letter may be either 'h' or 'y'
      t(afterS, 'h', end),
      t(afterS, 'y', end),
      t(end, end, '', true),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for the ちゃ/ちゅ/ちょ family: accepts "ch" + `end` as well as
   * "ty" + `end`.
   *
   * @param end the final vowel ('a', 'u' or 'o' at the call sites in this file)
   */
  function ch(end: string): StateMachine {
    const afterC = `h${end}`;
    const afterT = `y${end}`;
    const full = `c${afterC}`;
    const transitions = [
      // "ch…" spelling
      t(full, 'c', afterC),
      t(afterC, 'h', end, true),
      // "ty…" spelling
      t(full, 't', afterT, true),
      t(afterT, 'y', end),
      t(end, end, '', true),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for the じゃ/じゅ/じょ family: accepts "j" + `end`, "zy" + `end`
   * and, through the 'y' transition on the vowel state, "jy" + `end`.
   *
   * @param end the final vowel ('a', 'u' or 'o' at the call sites in this file)
   */
  function j(end: string): StateMachine {
    const full = `j${end}`;
    const afterZ = `y${end}`;
    const transitions = [
      t(full, 'j', end, true),
      t(full, 'z', afterZ),
      // NOTE(review): this maps the vowel state back onto itself, which is
      // what lets "jya" through; it looks like it would also accept repeated
      // 'y' presses. Confirm against the state module that this is intended.
      t(end, 'y', end),
      t(afterZ, 'y', end, true),
      t(end, end, '', true),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Wraps `base` so that its first input has to be typed twice, which is how
   * a leading っ is entered (e.g. "ka" becomes "kka").
   *
   * A new initial state is created whose label is the base label with its
   * first character doubled. For every input `k` the base machine accepts
   * first, an intermediate state labelled `k` + (label of the base target) is
   * inserted: the first `k` moves to the intermediate state and is flagged as
   * a boundary, the second `k` continues into the base machine.
   *
   * The states of `base` are reused, not copied.
   *
   * @param base machine for the kana that follows the っ
   * @returns a machine sharing `base`'s final state
   */
  function smallTsu(base: StateMachine): StateMachine {
    const { display, transitions } = base.initialState;
    const doubled = new State(display.charAt(0) + display);
    Object.keys(transitions).forEach(key => {
      const [target, boundary] = transitions[key];
      const repeat = new State(key + target.display);
      // Forward the flag the base machine attached to this transition instead
      // of dropping it; otherwise spellings such as "ttya" end up with one
      // boundary fewer than "ccha" for the same kana.
      repeat.addTransition(key, target, boundary);
      doubled.addTransition(key, repeat, true);
    });
    return new StateMachine(doubled, base.finalState);
  }
  /**
   * Wraps `base` for a small kana: the returned machine starts from a clone
   * of the base initial state that additionally accepts an 'l' or 'x' prefix
   * leading into the original initial state.
   *
   * @param base machine for the full-size kana
   * @returns a machine sharing `base`'s final state
   */
  function smallKana(base: StateMachine): StateMachine {
    const entry = base.initialState.clone();
    ['l', 'x'].forEach(prefix => {
      entry.addTransition(prefix, base.initialState);
    });
    return new StateMachine(entry, base.finalState);
  }
  /** Lookup table from a normalized source segment to the machine accepting it. */
  interface KanaMapping {
    [segment: string]: StateMachine;
  }
  /** Plain string-to-string lookup table. */
  interface StringMapping {
    [key: string]: string;
  }
  /**
   * Machine for a space in the source text. It is labelled '_' and is
   * completed by typing either '_' or ' '.
   */
  const WHITESPACE = state.buildFromTransitions(
    '_',
    ['_', ' '].map(key => t('_', key, ''))
  );
  /**
   * Katakana to hiragana lookup used when normalizing input.
   *
   * Every katakana listed here sits exactly 0x60 code points above its
   * hiragana counterpart, so the table is derived from the list of supported
   * katakana instead of being spelled out pair by pair. Only the characters
   * listed below are mapped.
   */
  const KATAKANA_MAPPING: StringMapping = [
    'アイウエオ',
    'カキクケコ',
    'サシスセソ',
    'タチツテト',
    'ナニヌネノ',
    'ハヒフヘホ',
    'マミムメモ',
    'ヤユヨ',
    'ラリルレロ',
    'ワヰヱヲン',
    'ガギグゲゴ',
    'ザジズゼゾ',
    'ダヂヅデド',
    'バビブベボ',
    'パピプペポ',
    'ヴ',
    'ァィゥェォ',
    'ャュョ',
    'ッ',
  ]
    .join('')
    .split('')
    .reduce<StringMapping>((table, katakana) => {
      table[katakana] = String.fromCharCode(katakana.charCodeAt(0) - 0x60);
      return table;
    }, {});
  /**
   * Machines for source segments that are one character long, grouped by
   * kana row. Kana with more than one accepted spelling use their dedicated
   * builder; everything else is a plain `literal`.
   */
  const SINGLE_KANA_MAPPING: KanaMapping = {
    // vowels
    'あ': literal('a'), 'い': literal('i'), 'う': literal('u'), 'え': literal('e'), 'お': literal('o'),
    // k
    'か': literal('ka'), 'き': literal('ki'), 'く': literal('ku'), 'け': literal('ke'), 'こ': literal('ko'),
    // s
    'さ': literal('sa'), 'し': shi(), 'す': literal('su'), 'せ': literal('se'), 'そ': literal('so'),
    // t
    'た': literal('ta'), 'ち': chi(), 'つ': tsu(), 'て': literal('te'), 'と': literal('to'),
    // n
    'な': literal('na'), 'に': literal('ni'), 'ぬ': literal('nu'), 'ね': literal('ne'), 'の': literal('no'),
    // h
    'は': literal('ha'), 'ひ': literal('hi'), 'ふ': fu(), 'へ': literal('he'), 'ほ': literal('ho'),
    // m
    'ま': literal('ma'), 'み': literal('mi'), 'む': literal('mu'), 'め': literal('me'), 'も': literal('mo'),
    // y
    'や': literal('ya'), 'ゆ': literal('yu'), 'よ': literal('yo'),
    // r
    'ら': literal('ra'), 'り': literal('ri'), 'る': literal('ru'), 'れ': literal('re'), 'ろ': literal('ro'),
    // w and ん
    'わ': literal('wa'), 'ゐ': literal('i'), 'ゑ': literal('e'), 'を': literal('wo'), 'ん': literal('n'),
    // g
    'が': literal('ga'), 'ぎ': literal('gi'), 'ぐ': literal('gu'), 'げ': literal('ge'), 'ご': literal('go'),
    // z
    'ざ': literal('za'), 'じ': ji(), 'ず': literal('zu'), 'ぜ': literal('ze'), 'ぞ': literal('zo'),
    // d
    'だ': literal('da'), 'ぢ': literal('di'), 'づ': literal('du'), 'で': literal('de'), 'ど': literal('do'),
    // b
    'ば': literal('ba'), 'び': literal('bi'), 'ぶ': literal('bu'), 'べ': literal('be'), 'ぼ': literal('bo'),
    // p
    'ぱ': literal('pa'), 'ぴ': literal('pi'), 'ぷ': literal('pu'), 'ぺ': literal('pe'), 'ぽ': literal('po'),
    // v, long-vowel mark, whitespace
    'ゔ': literal('vu'),
    'ー': literal('-'),
    ' ': WHITESPACE,
  };
  // Latin letters that appear in the source text are typed as themselves.
  for (let code = 'a'.charCodeAt(0); code <= 'z'.charCodeAt(0); ++code) {
    const letter = String.fromCharCode(code);
    SINGLE_KANA_MAPPING[letter] = literal(letter);
  }
  // Small kana reuse the machine of their full-size counterpart, wrapped by
  // smallKana. Each entry below is "<small><full-size>".
  'ぁあ ぃい ぅう ぇえ ぉお ヵか'.split(' ').forEach(pair => {
    const small = pair.charAt(0);
    const big = pair.charAt(1);
    SINGLE_KANA_MAPPING[small] = smallKana(SINGLE_KANA_MAPPING[big]);
  });
  /**
   * Machines for source segments that are two characters long (a kana
   * followed by a small kana). The plain `literal` entries flag the first
   * romaji character as an extra boundary; the し/ち/じ families use their
   * dedicated builders because they accept alternative spellings.
   */
  const DOUBLE_KANA_MAPPING: KanaMapping = {
    'きゃ': literal('kya', 0), 'きゅ': literal('kyu', 0), 'きょ': literal('kyo', 0),
    'しゃ': sh('a'), 'しゅ': sh('u'), 'しょ': sh('o'),
    'ちゃ': ch('a'), 'ちゅ': ch('u'), 'ちょ': ch('o'),
    'にゃ': literal('nya', 0), 'にゅ': literal('nyu', 0), 'にょ': literal('nyo', 0),
    'ひゃ': literal('hya', 0), 'ひゅ': literal('hyu', 0), 'ひょ': literal('hyo', 0),
    'みゃ': literal('mya', 0), 'みゅ': literal('myu', 0), 'みょ': literal('myo', 0),
    'りゃ': literal('rya', 0), 'りゅ': literal('ryu', 0), 'りょ': literal('ryo', 0),
    'ぎゃ': literal('gya', 0), 'ぎゅ': literal('gyu', 0), 'ぎょ': literal('gyo', 0),
    'じゃ': j('a'), 'じゅ': j('u'), 'じょ': j('o'),
    'ぢゃ': literal('dya', 0), 'ぢゅ': literal('dyu', 0), 'ぢょ': literal('dyo', 0),
    'びゃ': literal('bya', 0), 'びゅ': literal('byu', 0), 'びょ': literal('byo', 0),
    'ぴゃ': literal('pya', 0), 'ぴゅ': literal('pyu', 0), 'ぴょ': literal('pyo', 0),
    'ふぁ': literal('fa', 0), 'ふぃ': literal('fi', 0), 'ふぇ': literal('fe', 0), 'ふぉ': literal('fo', 0),
    'ゔぁ': literal('va', 0), 'ゔぃ': literal('vi', 0), 'ゔぇ': literal('ve', 0), 'ゔぉ': literal('vo', 0),
  };
  /**
   * Machines for source segments that are three characters long: っ followed
   * by a two-character entry of DOUBLE_KANA_MAPPING. Filled in below.
   */
  const TRIPLE_KANA_MAPPING: KanaMapping = {};

  // っ + single kana: registered as two-character segments.
  [
    "か", "き", "く", "け", "こ",
    "さ", "し", "す", "せ", "そ",
    "た", "ち", "つ", "て", "と",
    "は", "ひ", "ふ", "へ", "ほ",
    "が", "ぎ", "ぐ", "げ", "ご",
    "ざ", "じ", "ず", "ぜ", "ぞ",
    "だ", "ぢ", "づ", "で", "ど",
    "ば", "び", "ぶ", "べ", "ぼ",
    "ぱ", "ぴ", "ぷ", "ぺ", "ぽ",
    "ゔ"
  ].forEach(kana => {
    DOUBLE_KANA_MAPPING['っ' + kana] = smallTsu(SINGLE_KANA_MAPPING[kana]);
  });

  // っ + kana pair: registered as three-character segments. The ひゃ row is
  // included because っひ is registered above; without it "っひゃ" would be
  // matched as "っひ" and the leftover "ゃ", which has no machine of its own,
  // would be skipped by KanaInputState.
  [
    "きゃ", "きゅ", "きょ",
    "しゃ", "しゅ", "しょ",
    "ちゃ", "ちゅ", "ちょ",
    "ひゃ", "ひゅ", "ひょ",
    "ぎゃ", "ぎゅ", "ぎょ",
    "じゃ", "じゅ", "じょ",
    "ぢゃ", "ぢゅ", "ぢょ",
    "びゃ", "びゅ", "びょ",
    "ぴゃ", "ぴゅ", "ぴょ",
    "ふぁ", "ふぃ", "ふぇ", "ふぉ",
    "ゔぁ", "ゔぃ", "ゔぇ", "ゔぉ"
  ].forEach(kana => {
    TRIPLE_KANA_MAPPING['っ' + kana] = smallTsu(DOUBLE_KANA_MAPPING[kana]);
  });
  /**
   * Normalizes input for matching: letters are lower-cased, katakana listed
   * in KATAKANA_MAPPING is converted to hiragana, and every whitespace
   * character becomes a plain space.
   *
   * The result always has exactly the same length as `input`, because callers
   * match it position-for-position against the original text in order to
   * display the original source kana. A few characters (for example U+0130,
   * "İ") grow when lower-cased; those are left untouched rather than allowed
   * to shift every position after them.
   *
   * @param input raw source text
   * @returns the normalized text, with `result.length === input.length`
   */
  export function normalizeInput(input: string): string {
    const lowered = input.toLowerCase();
    // Fast path: lower-casing kept the length, so positions still line up.
    // Otherwise lower-case one UTF-16 unit at a time and keep the original
    // unit wherever its lower-case form would have a different length.
    const letters =
      lowered.length === input.length
        ? lowered.split('')
        : input.split('').map(letter => {
            const lower = letter.toLowerCase();
            return lower.length === letter.length ? lower : letter;
          });

    return letters
      .map(letter => {
        const hiragana = KATAKANA_MAPPING[letter];
        if (hiragana !== undefined) {
          return hiragana;
        }
        return /\s/.test(letter) ? ' ' : letter;
      })
      .join('');
  }
  /**
   * Tracks typing progress through a piece of source text.
   *
   * The text is split into segments of up to three characters, each backed
   * by its own cloned state machine; consecutive machines are linked through
   * `nextMachine`. Characters with no machine are skipped.
   */
  export class KanaInputState {
    /** Original (un-normalized) text of each segment, in order. */
    kana: string[];
    /** One machine per entry of `kana`, same order. */
    stateMachines: StateMachine[];
    /** Index of the machine currently receiving input. */
    currentIndex: number;

    /**
     * @param input source text; it is normalized for lookup, while the
     *   original characters are what gets stored in `kana`
     */
    constructor(input: string) {
      const segments: string[] = [];
      const chain: StateMachine[] = [];
      // tablesByWidth[w - 1] holds the machines for segments of width w
      const tablesByWidth = [
        SINGLE_KANA_MAPPING,
        DOUBLE_KANA_MAPPING,
        TRIPLE_KANA_MAPPING
      ];
      // padded with one trailing space, as in the lookups below
      const normalized = normalizeInput(input) + ' ';

      let cursor = 0;
      while (cursor < input.length) {
        // characters without any machine are skipped one at a time
        let consumed = 1;
        // longest match first: width 3, then 2, then 1
        for (let width = 3; width >= 1; --width) {
          const lookupKey = normalized.slice(cursor, cursor + width);
          const template = tablesByWidth[width - 1][lookupKey];
          if (template == undefined) {
            continue;
          }
          segments.push(input.slice(cursor, cursor + width));
          const machine = template.clone();
          if (chain.length > 0) {
            chain[chain.length - 1].nextMachine = machine;
          }
          chain.push(machine);
          consumed = width;
          break;
        }
        cursor += consumed;
      }

      this.kana = segments;
      this.stateMachines = chain;
      this.currentIndex = 0;
    }

    /**
     * Calls `func` with every segment and its machine, in order.
     *
     * @returns the collected results
     */
    map<T>(func: (s: string, m: StateMachine) => T): T[] {
      const collected: T[] = [];
      for (let idx = 0; idx < this.kana.length; idx += 1) {
        collected.push(func(this.kana[idx], this.stateMachines[idx]));
      }
      return collected;
    }

    /**
     * Feeds one piece of typed input to the current machine and advances past
     * every machine that is finished afterwards.
     *
     * @returns true when this call moved past the last machine; false
     *   otherwise, including when everything was already complete
     */
    handleInput(input: string): boolean {
      if (this.currentIndex >= this.stateMachines.length) {
        return false;
      }

      let active = this.stateMachines[this.currentIndex];
      active.transition(input);
      while (active.isFinished()) {
        this.currentIndex += 1;
        active = this.stateMachines[this.currentIndex];
        if (active == null) {
          return true;
        }
      }
      return this.currentIndex >= this.stateMachines.length;
    }

    /** Concatenates the display text of the current and all later machines. */
    getRemainingInput(): string {
      return this.stateMachines
        .slice(this.currentIndex)
        .map(machine => machine.getDisplay())
        .join('');
    }
  }