kana.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. /**
  2. * This module is mainly for handling romaji input to match the provided kana
  3. * input. While most kana map one-to-one with romaji, some kana can be typed
  4. * in more than one way. In addition, we also have to handle っ, which causes the
  5. * next consonant to be repeated.
  6. *
  7. * The state management is done by having a state machine for each kana and it
  8. * should handle all possible variations of the romaji to be inputted.
  9. * Additionally, it also keeps track of what is left to be input, and adjusts
  10. * itself accordingly if an alternative romaji was used.
  11. *
  12. * One of the key considerations is handling っ. It doesn't have a spelling in
  13. * and of itself, but just modifies the state machine that will come after it.
  14. * Intermediate states need to be created and care should be given in what shows
  15. * up in the display.
  16. */
  17. import * as state from './state';
  18. import {
  19. State,
  20. StateMachine,
  21. makeTransition as t,
  22. mergeMachines,
  23. appendMachines,
  24. appendStates,
  25. } from './state';
  /**
   * Builds a machine that accepts exactly the characters of `source`, in order.
   *
   * Every state is keyed by the still-untyped suffix of `source`. A running
   * counter is handed to each transition as its fourth argument; the counter is
   * bumped on the transition that consumes the last character and on every
   * transition whose character index appears in `extraBoundaries`.
   * NOTE(review): the counter presumably marks how many kana are complete once
   * the transition is taken — confirm against `makeTransition` in ./state.
   *
   * @param source the romaji spelling to accept
   * @param extraBoundaries additional character indices that bump the counter
   * @returns a machine whose initial state is keyed by `source`
   */
  function literal(source: string, ...extraBoundaries: number[]): StateMachine {
    let boundaryCount = 0;
    const lastIndex = source.length - 1;
    const transitions: state.Transition[] = source
      .split('')
      .map((input, index) => {
        const isBoundary =
          index === lastIndex || extraBoundaries.some((b) => b === index);
        if (isBoundary) {
          boundaryCount += 1;
        }
        const remainingBefore = source.substring(index);
        const remainingAfter = source.substring(index + 1);
        return t(remainingBefore, input, remainingAfter, boundaryCount);
      });
    return state.buildFromTransitions(source, transitions);
  }
  /**
   * Machine for し: accepts "shi" as well as the shorter "si".
   */
  function shi(): StateMachine {
    const transitions: state.Transition[] = [
      t('shi', 's', 'hi'),
      // after "s" either continue with "h" ...
      t('hi', 'h', 'i'),
      // ... or finish right away with "i" ("si")
      t('hi', 'i', '', 1),
      t('i', 'i', '', 1),
    ];
    return state.buildFromTransitions('shi', transitions);
  }
  /**
   * Machine for ち: accepts "chi" as well as "ti".
   */
  function chi(): StateMachine {
    const transitions: state.Transition[] = [
      // "c" must be followed by "h"
      t('chi', 'c', 'hi'),
      // "t" skips straight to the vowel ("ti")
      t('chi', 't', 'i'),
      t('hi', 'h', 'i'),
      t('i', 'i', '', 1),
    ];
    return state.buildFromTransitions('chi', transitions);
  }
  /**
   * Machine for つ: accepts "tsu" as well as "tu".
   */
  function tsu(): StateMachine {
    const transitions: state.Transition[] = [
      t('tsu', 't', 'su'),
      // after "t" either continue with "s" ...
      t('su', 's', 'u'),
      // ... or finish right away with "u" ("tu")
      t('su', 'u', '', 1),
      t('u', 'u', '', 1),
    ];
    return state.buildFromTransitions('tsu', transitions);
  }
  /**
   * Machine for ふ: accepts "fu" as well as "hu".
   */
  function fu(): StateMachine {
    const transitions: state.Transition[] = [
      // either leading consonant leads to the same "u" state
      t('fu', 'f', 'u'),
      t('fu', 'h', 'u'),
      t('u', 'u', '', 1),
    ];
    return state.buildFromTransitions('fu', transitions);
  }
  /**
   * Machine for ふ followed by a small vowel (ふぁ, ふぃ, ...).
   *
   * Two spellings are merged: the direct "f" + vowel form, and ふ typed on its
   * own followed by the small-kana form of the vowel.
   *
   * @param vowel romaji of the small vowel ("a", "i", "e" or "o")
   */
  function f(vowel: string): StateMachine {
    const direct = literal(`f${vowel}`, 0);
    const fuThenSmallVowel = appendMachines(fu(), smallKana(literal(vowel)));
    return mergeMachines(direct, fuThenSmallVowel);
  }
  /**
   * Machine for ゔ followed by a small vowel (ゔぁ, ゔぃ, ...).
   *
   * Two spellings are merged: the direct "v" + vowel form, and "vu" followed by
   * the small-kana form of the vowel.
   *
   * @param vowel romaji of the small vowel ("a", "i", "e" or "o")
   */
  function v(vowel: string): StateMachine {
    const direct = literal(`v${vowel}`, 0);
    const vuThenSmallVowel = appendMachines(
      literal('vu'),
      smallKana(literal(vowel))
    );
    return mergeMachines(direct, vuThenSmallVowel);
  }
  /**
   * Machine for じ: accepts "ji" as well as "zi".
   */
  function ji(): StateMachine {
    const transitions: state.Transition[] = [
      // either leading consonant leads to the same "i" state
      t('ji', 'j', 'i'),
      t('ji', 'z', 'i'),
      t('i', 'i', '', 1),
    ];
    return state.buildFromTransitions('ji', transitions);
  }
  /**
   * Machine for the しゃ/しゅ/しょ family: accepts "sh" + `end` as well as
   * "sy" + `end`.
   *
   * @param end the final vowel of the spelling ("a", "u" or "o")
   */
  function sh(end: string): StateMachine {
    const full = `sh${end}`;
    const afterS = `h${end}`;
    const transitions: state.Transition[] = [
      t(full, 's', afterS, 1),
      // the second letter may be either "h" or "y"
      t(afterS, 'h', end),
      t(afterS, 'y', end),
      t(end, end, '', 2),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for the ちゃ/ちゅ/ちょ family: accepts "ch" + `end` as well as
   * "ty" + `end`.
   *
   * @param end the final vowel of the spelling ("a", "u" or "o")
   */
  function ch(end: string): StateMachine {
    const full = `ch${end}`;
    const afterC = `h${end}`;
    const afterT = `y${end}`;
    const transitions: state.Transition[] = [
      // "c" path: c -> h -> vowel
      t(full, 'c', afterC),
      t(afterC, 'h', end, 1),
      // "t" path: t -> y -> vowel
      t(full, 't', afterT, 1),
      t(afterT, 'y', end),
      t(end, end, '', 2),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for the じゃ/じゅ/じょ family: accepts "j" + `end`, "jy" + `end`
   * and "zy" + `end`.
   *
   * @param end the final vowel of the spelling ("a", "u" or "o")
   */
  function j(end: string): StateMachine {
    const short = literal(`j${end}`, 0);
    const withJy = literal(`jy${end}`, 0);
    const withZy = literal(`zy${end}`, 0);
    return mergeMachines(short, withJy, withZy);
  }
  /**
   * Wraps `base` so that it is preceded by っ.
   *
   * Two ways of typing are merged:
   *  - doubling: a new initial state displays the first character of the base
   *    display twice. For every key the base accepts first, pressing that key
   *    leads to an intermediate state that expects the same key again and then
   *    continues with whatever the base would have reached after that key;
   *  - explicit: the small-kana form of つ followed by the unmodified base.
   *
   * @param base machine for the kana that follows っ
   */
  function smallTsu(base: StateMachine): StateMachine {
    const { display, transitions } = base.initialState;
    const doubledStart = new State(display.charAt(0) + display, 0);

    for (const key of Object.keys(transitions)) {
      const afterKey = transitions[key];
      // state reached after the first press; it waits for the repeated key
      const awaitingRepeat = new State(key, 0);
      awaitingRepeat.addTransition(key, new State('', 1));
      doubledStart.addTransition(key, appendStates(awaitingRepeat, afterKey));
    }

    const viaDoubling = new StateMachine(doubledStart);
    const viaExplicitTsu = appendMachines(smallKana(tsu()), base);
    return mergeMachines(viaDoubling, viaExplicitTsu);
  }
  /**
   * Builds the small-kana variant of `base`.
   *
   * The new initial state is a clone of the base's initial state with two extra
   * transitions, "l" and "x", that both lead to the base's own initial state —
   * so the spelling may optionally be prefixed with either letter.
   *
   * @param base machine for the full-size kana
   */
  function smallKana(base: StateMachine): StateMachine {
    const start = base.initialState.clone();
    for (const prefix of ['l', 'x']) {
      start.addTransition(prefix, base.initialState);
    }
    return new StateMachine(start);
  }
  /**
   * Wraps `base` so that it is preceded by ん, accepting either a single "n" or
   * "nn" before the base spelling.
   *
   * A single "n" is only unambiguous when the base cannot start with "n", a
   * vowel or "y", so any base whose initial state has a transition on one of
   * those keys is rejected.
   *
   * @param base machine for the kana that follows ん
   * @throws Error when the base starts with one of the ambiguous keys
   */
  function n(base: StateMachine): StateMachine {
    const ambiguousKeys = ['n', 'a', 'i', 'u', 'e', 'o', 'y'];
    const startsAmbiguously = ambiguousKeys.some(
      (key) => base.initialState.transition(key) !== undefined
    );
    if (startsAmbiguously) {
      throw new Error(
        `Invalid base ${base.initialState.display}, just defer to literal`
      );
    }

    return mergeMachines(
      appendMachines(literal('n'), base),
      appendMachines(literal('nn'), base)
    );
  }
  /** Lookup table from a (normalized) kana segment to its template machine. */
  interface KanaMapping {
    [segment: string]: StateMachine;
  }
  /** Lookup table from one string to another. */
  interface StringMapping {
    [key: string]: string;
  }
  /**
   * Machine for a blank: it is keyed as "_" and is completed by typing either
   * an underscore or a space.
   */
  const WHITESPACE = state.buildFromTransitions(
    '_',
    ['_', ' '].map((key) => t('_', key, ''))
  );
  /**
   * Katakana → hiragana lookup, one entry per character.
   *
   * The table is assembled from aligned row pairs: the n-th character of the
   * katakana row maps to the n-th character of the hiragana row.
   */
  const KATAKANA_MAPPING: StringMapping = (() => {
    const rows: [string, string][] = [
      ['アイウエオ', 'あいうえお'],
      ['カキクケコ', 'かきくけこ'],
      ['サシスセソ', 'さしすせそ'],
      ['タチツテト', 'たちつてと'],
      ['ナニヌネノ', 'なにぬねの'],
      ['ハヒフヘホ', 'はひふへほ'],
      ['マミムメモ', 'まみむめも'],
      ['ヤユヨ', 'やゆよ'],
      ['ラリルレロ', 'らりるれろ'],
      ['ワヰヱヲン', 'わゐゑをん'],
      ['ガギグゲゴ', 'がぎぐげご'],
      ['ザジズゼゾ', 'ざじずぜぞ'],
      ['ダヂヅデド', 'だぢづでど'],
      ['バビブベボ', 'ばびぶべぼ'],
      ['パピプペポ', 'ぱぴぷぺぽ'],
      ['ヴ', 'ゔ'],
      ['ァィゥェォ', 'ぁぃぅぇぉ'],
      ['ャュョ', 'ゃゅょ'],
      ['ッ', 'っ'],
    ];
    const table: StringMapping = {};
    for (const [katakana, hiragana] of rows) {
      for (let i = 0; i < katakana.length; ++i) {
        table[katakana.charAt(i)] = hiragana.charAt(i);
      }
    }
    return table;
  })();
  /**
   * Template machines for segments that are a single character long, grouped
   * by row. Kana with more than one accepted spelling use their dedicated
   * builder; everything else is a plain literal.
   */
  // prettier-ignore
  const SINGLE_KANA_MAPPING: KanaMapping = {
    // vowels
    あ: literal('a'), い: literal('i'), う: literal('u'), え: literal('e'), お: literal('o'),
    // k
    か: literal('ka'), き: literal('ki'), く: literal('ku'), け: literal('ke'), こ: literal('ko'),
    // s
    さ: literal('sa'), し: shi(), す: literal('su'), せ: literal('se'), そ: literal('so'),
    // t
    た: literal('ta'), ち: chi(), つ: tsu(), て: literal('te'), と: literal('to'),
    // n
    な: literal('na'), に: literal('ni'), ぬ: literal('nu'), ね: literal('ne'), の: literal('no'),
    // h
    は: literal('ha'), ひ: literal('hi'), ふ: fu(), へ: literal('he'), ほ: literal('ho'),
    // m
    ま: literal('ma'), み: literal('mi'), む: literal('mu'), め: literal('me'), も: literal('mo'),
    // y
    や: literal('ya'), ゆ: literal('yu'), よ: literal('yo'),
    // r
    ら: literal('ra'), り: literal('ri'), る: literal('ru'), れ: literal('re'), ろ: literal('ro'),
    // w and ん
    わ: literal('wa'), ゐ: literal('i'), ゑ: literal('e'), を: literal('wo'), ん: literal('nn'),
    // g
    が: literal('ga'), ぎ: literal('gi'), ぐ: literal('gu'), げ: literal('ge'), ご: literal('go'),
    // z
    ざ: literal('za'), じ: ji(), ず: literal('zu'), ぜ: literal('ze'), ぞ: literal('zo'),
    // d
    だ: literal('da'), ぢ: literal('di'), づ: literal('du'), で: literal('de'), ど: literal('do'),
    // b
    ば: literal('ba'), び: literal('bi'), ぶ: literal('bu'), べ: literal('be'), ぼ: literal('bo'),
    // p
    ぱ: literal('pa'), ぴ: literal('pi'), ぷ: literal('pu'), ぺ: literal('pe'), ぽ: literal('po'),
    // v
    ゔ: literal('vu'),
    // long-vowel mark and blank
    ー: literal('-'),
    ' ': WHITESPACE,
  };
  // Plain ASCII letters appearing in the source text are typed as themselves.
  for (const letter of 'abcdefghijklmnopqrstuvwxyz'.split('')) {
    SINGLE_KANA_MAPPING[letter] = literal(letter);
  }

  // Small kana reuse the machine of their full-size counterpart, wrapped so
  // that an "l" or "x" prefix is accepted as well.
  for (const [small, big] of [
    ['ぁ', 'あ'],
    ['ぃ', 'い'],
    ['ぅ', 'う'],
    ['ぇ', 'え'],
    ['ぉ', 'お'],
    ['ヵ', 'か'],
  ]) {
    SINGLE_KANA_MAPPING[small] = smallKana(SINGLE_KANA_MAPPING[big]);
  }
  /**
   * Template machines for segments that are two characters long (a kana
   * followed by a small kana), grouped by row. Families with alternative
   * spellings use their dedicated builder; the rest are literals with an extra
   * boundary at index 0.
   */
  // prettier-ignore
  const DOUBLE_KANA_MAPPING: KanaMapping = {
    // ky
    きゃ: literal('kya', 0), きゅ: literal('kyu', 0), きょ: literal('kyo', 0),
    // sh
    しゃ: sh('a'), しゅ: sh('u'), しょ: sh('o'),
    // ch
    ちゃ: ch('a'), ちゅ: ch('u'), ちょ: ch('o'),
    // ny
    にゃ: literal('nya', 0), にゅ: literal('nyu', 0), にょ: literal('nyo', 0),
    // hy
    ひゃ: literal('hya', 0), ひゅ: literal('hyu', 0), ひょ: literal('hyo', 0),
    // my
    みゃ: literal('mya', 0), みゅ: literal('myu', 0), みょ: literal('myo', 0),
    // ry
    りゃ: literal('rya', 0), りゅ: literal('ryu', 0), りょ: literal('ryo', 0),
    // gy
    ぎゃ: literal('gya', 0), ぎゅ: literal('gyu', 0), ぎょ: literal('gyo', 0),
    // j
    じゃ: j('a'), じゅ: j('u'), じょ: j('o'),
    // dy
    ぢゃ: literal('dya', 0), ぢゅ: literal('dyu', 0), ぢょ: literal('dyo', 0),
    // by
    びゃ: literal('bya', 0), びゅ: literal('byu', 0), びょ: literal('byo', 0),
    // py
    ぴゃ: literal('pya', 0), ぴゅ: literal('pyu', 0), ぴょ: literal('pyo', 0),
    // f + small vowel
    ふぁ: f('a'), ふぃ: f('i'), ふぇ: f('e'), ふぉ: f('o'),
    // v + small vowel
    ゔぁ: v('a'), ゔぃ: v('i'), ゔぇ: v('e'), ゔぉ: v('o'),
  };
  /** Template machines for segments that are three characters long. */
  const TRIPLE_KANA_MAPPING: KanaMapping = {};

  // Register the っ-prefixed and ん-prefixed form of every single kana listed
  // here as a two-character segment.
  // prettier-ignore
  for (const kana of [
    'か', 'き', 'く', 'け', 'こ',
    'さ', 'し', 'す', 'せ', 'そ',
    'た', 'ち', 'つ', 'て', 'と',
    'は', 'ひ', 'ふ', 'へ', 'ほ',
    'が', 'ぎ', 'ぐ', 'げ', 'ご',
    'ざ', 'じ', 'ず', 'ぜ', 'ぞ',
    'だ', 'ぢ', 'づ', 'で', 'ど',
    'ば', 'び', 'ぶ', 'べ', 'ぼ',
    'ぱ', 'ぴ', 'ぷ', 'ぺ', 'ぽ',
    'ゔ',
  ]) {
    const base = SINGLE_KANA_MAPPING[kana];
    DOUBLE_KANA_MAPPING['っ' + kana] = smallTsu(base);
    DOUBLE_KANA_MAPPING['ん' + kana] = n(base);
  }
  // Register the っ-prefixed and ん-prefixed form of every two-character kana
  // listed here as a three-character segment.
  // prettier-ignore
  for (const kana of [
    'きゃ', 'きゅ', 'きょ',
    'しゃ', 'しゅ', 'しょ',
    'ちゃ', 'ちゅ', 'ちょ',
    'ぎゃ', 'ぎゅ', 'ぎょ',
    'じゃ', 'じゅ', 'じょ',
    'ぢゃ', 'ぢゅ', 'ぢょ',
    'びゃ', 'びゅ', 'びょ',
    'ぴゃ', 'ぴゅ', 'ぴょ',
    'ふぁ', 'ふぃ', 'ふぇ', 'ふぉ',
    'ゔぁ', 'ゔぃ', 'ゔぇ', 'ゔぉ',
  ]) {
    const base = DOUBLE_KANA_MAPPING[kana];
    TRIPLE_KANA_MAPPING['っ' + kana] = smallTsu(base);
    TRIPLE_KANA_MAPPING['ん' + kana] = n(base);
  }
  /**
   * Lower-cases `text` while guaranteeing the result has the same length.
   *
   * `String.prototype.toLowerCase` may lengthen a string (for example U+0130
   * lower-cases to two code units). When that happens we fall back to
   * lower-casing one code unit at a time and keep any unit whose lower-case
   * form would not fit in a single unit.
   */
  function lowerCasePreservingLength(text: string): string {
    const lowered = text.toLowerCase();
    if (lowered.length === text.length) {
      return lowered;
    }
    return text
      .split('')
      .map((unit) => {
        const lower = unit.toLowerCase();
        return lower.length === unit.length ? lower : unit;
      })
      .join('');
  }

  /**
   * This normalizes input for matching. All alphabet is lower-cased, katakana
   * is transformed to hiragana, and every whitespace character becomes a plain
   * space.
   *
   * The result always has exactly the same length as `input`: callers index
   * into the original and the normalized string with the same positions so
   * that the original source kana can be displayed.
   *
   * @param input the raw text to be typed
   * @returns the normalized text, one output character per input character
   */
  export function normalizeInput(input: string): string {
    const whitespace = /\s/;
    return lowerCasePreservingLength(input)
      .split('')
      .map((letter) => {
        const hiragana = KATAKANA_MAPPING[letter];
        if (hiragana !== undefined) {
          return hiragana;
        }
        return whitespace.test(letter) ? ' ' : letter;
      })
      .join('');
  }
  /**
   * Tracks typing progress through a piece of kana text.
   *
   * The text is cut into segments of up to three characters, each paired with
   * its own clone of the matching template machine. Characters that match no
   * mapping are skipped and take no part in the input.
   */
  export class KanaInputState {
    /** Original (un-normalized) text of each recognized segment. */
    kana: string[];
    /** One machine per entry of `kana`, in the same order. */
    stateMachines: StateMachine[];
    /** Index of the machine that receives the next key press. */
    currentIndex: number;

    /**
     * @param input the text the user has to type
     */
    constructor(input: string) {
      const segments: string[] = [];
      const machines: StateMachine[] = [];
      const mappingsByLength = [
        SINGLE_KANA_MAPPING,
        DOUBLE_KANA_MAPPING,
        TRIPLE_KANA_MAPPING,
      ];
      // we pad the input so checking 3 at a time is simpler
      const normalized = normalizeInput(input) + ' ';

      let position = 0;
      while (position < input.length) {
        // characters without a mapping are ignored: step over exactly one
        let consumed = 1;
        // prefer the longest match: try lengths 3, 2, then 1
        for (let length = 3; length >= 1; --length) {
          const end = position + length;
          const template = mappingsByLength[length - 1][normalized.slice(position, end)];
          if (template == undefined) {
            continue;
          }
          segments.push(input.slice(position, end));
          const machine = template.clone();
          if (machines.length > 0) {
            // chain the machines so the previous one knows its successor
            machines[machines.length - 1].nextMachine = machine;
          }
          machines.push(machine);
          consumed = length;
          break;
        }
        position += consumed;
      }

      this.kana = segments;
      this.stateMachines = machines;
      this.currentIndex = 0;
    }

    /**
     * Calls `func` with every segment and its machine, in order.
     *
     * @returns the collected results
     */
    map<T>(func: (s: string, m: StateMachine) => T): T[] {
      return this.kana.map((segment, index) =>
        func(segment, this.stateMachines[index])
      );
    }

    /**
     * Feeds one key press to the current machine and then advances past every
     * machine that is already finished.
     *
     * @returns true when this key press completed the whole text; false
     *   otherwise, including when the text was already complete
     */
    handleInput(input: string): boolean {
      if (this.currentIndex >= this.stateMachines.length) return false;

      let active = this.stateMachines[this.currentIndex];
      active.transition(input);
      while (active.isFinished()) {
        this.currentIndex += 1;
        active = this.stateMachines[this.currentIndex];
        if (active == null) {
          return true;
        }
      }
      return this.currentIndex >= this.stateMachines.length;
    }

    /** Whether every machine has been completed. */
    isFinished(): boolean {
      return this.currentIndex >= this.stateMachines.length;
    }

    /** Concatenated display text of the current and all following machines. */
    getRemainingInput(): string {
      return this.stateMachines
        .slice(this.currentIndex)
        .reduce((remaining, machine) => remaining + machine.getDisplay(), '');
    }
  }
  569. }