kana.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639
  /**
   * This module handles romaji input and matches it against the provided kana
   * input. While most kana map one-to-one to romaji, some kana can be typed in
   * more than one way. In addition, we also have to handle っ, which causes the
   * next consonant to be repeated.
   *
   * State is managed by a state machine for each kana, which should handle
   * every possible romaji variation that can be typed for it. Each machine
   * also keeps track of what is left to be typed, and adjusts itself
   * accordingly if an alternative romaji spelling was used.
   *
   * One of the key considerations is handling っ. It doesn't have a spelling in
   * and of itself; it just modifies the state machine that comes after it.
   * Intermediate states need to be created, and care must be taken over what
   * shows up in the display.
   */
  17. import * as state from './state';
  18. import {
  19. State,
  20. StateMachine,
  21. makeTransition as t,
  22. mergeMachines,
  23. appendMachines,
  24. appendStates,
  25. } from './state';
  /**
   * Matches one character from either of two ranges: ぁ–ん (U+3041–U+3093) or
   * ァ–ン (U+30A1–U+30F3). The pattern has no flags and no anchors, so `test`
   * succeeds when such a character appears anywhere in the string. Characters
   * outside both ranges (for example ゔ, ヴ and ー) do not match.
   */
  export const KANA_REGEX = /[\u3041-\u3093\u30A1-\u30F3]/;
  /**
   * Builds a machine that accepts `source` typed one character at a time.
   *
   * One transition is created per character: from the remaining suffix, on
   * that character, to the suffix that follows it. A running counter is passed
   * as the transition's `meta`; it is bumped before the transition for the
   * last character and before the transition at every index listed in
   * `extraBoundaries`.
   *
   * @param source the exact romaji to accept
   * @param extraBoundaries additional character indices at which the counter
   *   is bumped
   * @returns the machine built from those transitions, starting at `source`
   */
  function literal(source: string, ...extraBoundaries: number[]): StateMachine {
    const lastIndex = source.length - 1;
    let boundaryCount = 0;
    const steps: state.Transition[] = source.split('').map((key, index) => {
      const isBoundary =
        index === lastIndex || extraBoundaries.some((b) => b === index);
      if (isBoundary) {
        boundaryCount += 1;
      }
      return t(source.slice(index), key, source.slice(index + 1), boundaryCount);
    });
    return state.buildFromTransitions(source, steps);
  }
  /**
   * Machine for し. Its transitions spell `shi` (s, h, i) and also allow `i`
   * straight after `s`, i.e. `si`. Both transitions that reach the empty
   * display carry meta 1.
   */
  function shi(): StateMachine {
    const full = 'shi';
    const afterS = 'hi';
    const afterH = 'i';
    const done = '';
    const transitions = [
      t(full, 's', afterS),
      t(afterS, 'h', afterH),
      t(afterS, 'i', done, 1),
      t(afterH, 'i', done, 1),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for ち. Its transitions spell `chi` (c, h, i) and `ti` (t, i); the
   * final `i` transition carries meta 1.
   */
  function chi(): StateMachine {
    const full = 'chi';
    const afterC = 'hi';
    const vowel = 'i';
    const transitions = [
      t(full, 'c', afterC),
      t(full, 't', vowel),
      t(afterC, 'h', vowel),
      t(vowel, 'i', '', 1),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for つ. Its transitions spell `tsu` (t, s, u) and also allow `u`
   * straight after `t`, i.e. `tu`. Both transitions that reach the empty
   * display carry meta 1.
   */
  function tsu(): StateMachine {
    const full = 'tsu';
    const afterT = 'su';
    const afterS = 'u';
    const done = '';
    const transitions = [
      t(full, 't', afterT),
      t(afterT, 's', afterS),
      t(afterT, 'u', done, 1),
      t(afterS, 'u', done, 1),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for ふ. The first key may be `f` or `h`; both lead to the same `u`
   * state, whose transition to the empty display carries meta 1.
   */
  function fu(): StateMachine {
    const full = 'fu';
    const vowel = 'u';
    const transitions = [
      t(full, 'f', vowel),
      t(full, 'h', vowel),
      t(vowel, 'u', '', 1),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for ふ followed by a small vowel (ふぁ, ふぃ, ...).
   *
   * Merges two alternatives: the direct spelling `f` + the vowel's initial
   * display (built with an extra boundary at index 0), and the ふ machine
   * followed by `vowel` itself.
   *
   * @param vowel machine for the small vowel
   */
  function f(vowel: StateMachine): StateMachine {
    const vowelRomaji = vowel.initialState.display;
    const direct = literal(`f${vowelRomaji}`, 0);
    const spelledOut = appendMachines(fu(), vowel);
    return mergeMachines(direct, spelledOut);
  }
  /**
   * Machine for ゔ followed by a small vowel (ゔぁ, ゔぃ, ...).
   *
   * Merges two alternatives: the direct spelling `v` + the vowel's initial
   * display (built with an extra boundary at index 0), and the literal `vu`
   * machine followed by `vowel` itself.
   *
   * @param vowel machine for the small vowel
   */
  function v(vowel: StateMachine): StateMachine {
    const vowelRomaji = vowel.initialState.display;
    const direct = literal(`v${vowelRomaji}`, 0);
    const spelledOut = appendMachines(literal('vu'), vowel);
    return mergeMachines(direct, spelledOut);
  }
  /**
   * Machine for an i-column kana followed by a small や/ゆ/よ (きゃ, にゅ, ...).
   *
   * Two alternatives are merged:
   *  - a contracted spelling, derived from `base` by replacing the trailing
   *    `i` of every state's display with the vowel's initial display (for
   *    example `ki` -> `kya`);
   *  - `base` followed by the small-kana machine registered for `vowel` in
   *    SMALL_KANA_MAPPING.
   *
   * @param base machine for the i-column kana
   * @param vowel machine for the full-size や/ゆ/よ; it must already have an
   *   entry in SMALL_KANA_MAPPING
   */
  function y(base: StateMachine, vowel: StateMachine): StateMachine {
    const contracted = base.initialState.transform((original) => {
      return [
        original.display.replace(/i$/, vowel.initialState.display),
        original.meta,
      ];
    });
    // Copy of the vowel's states with every meta raised by one.
    const bumpedVowel = vowel.initialState.transform((original) => {
      return [original.display, original.meta + 1];
    });
    // Wherever the contracted states lead to a state that displays exactly the
    // vowel's romaji, redirect that transition into the bumped vowel copy.
    for (const node of contracted.closure()) {
      for (const key in node.transitions) {
        const target = node.transitions[key];
        if (target.display === vowel.initialState.display) {
          node.transitions[key] = bumpedVowel;
        }
      }
    }
    const contractedMachine = new StateMachine(contracted);
    const spelledOut = appendMachines(base, SMALL_KANA_MAPPING.get(vowel)!);
    return mergeMachines(contractedMachine, spelledOut);
  }
  /**
   * Machine for じ. The first key may be `j` or `z`; both lead to the same `i`
   * state, whose transition to the empty display carries meta 1.
   */
  function ji(): StateMachine {
    const full = 'ji';
    const vowel = 'i';
    const transitions = [
      t(full, 'j', vowel),
      t(full, 'z', vowel),
      t(vowel, 'i', '', 1),
    ];
    return state.buildFromTransitions(full, transitions);
  }
  /**
   * Machine for し followed by a small や/ゆ/よ (しゃ, しゅ, しょ).
   *
   * The contracted alternative is `s`, then `h` or `y`, then the vowel key
   * (the vowel's initial display with its leading `y` removed). It is merged
   * with the し machine followed by the small-kana machine registered for
   * `vowel` in SMALL_KANA_MAPPING.
   *
   * @param vowel machine for the full-size や/ゆ/よ
   */
  function sh(vowel: StateMachine): StateMachine {
    const vowelKey = vowel.initialState.display.replace(/^y/, '');
    const full = `sh${vowelKey}`;
    const afterS = `h${vowelKey}`;
    const contracted = state.buildFromTransitions(full, [
      t(full, 's', afterS, 1),
      t(afterS, 'h', vowelKey),
      t(afterS, 'y', vowelKey),
      t(vowelKey, vowelKey, '', 2),
    ]);
    const spelledOut = appendMachines(shi(), SMALL_KANA_MAPPING.get(vowel)!);
    return mergeMachines(contracted, spelledOut);
  }
  /**
   * Machine for ち followed by a small や/ゆ/よ (ちゃ, ちゅ, ちょ).
   *
   * The contracted alternative is either `c`, `h`, vowel key or `t`, `y`,
   * vowel key (the vowel key is the vowel's initial display with its leading
   * `y` removed). It is merged with the ち machine followed by the small-kana
   * machine registered for `vowel` in SMALL_KANA_MAPPING.
   *
   * @param vowel machine for the full-size や/ゆ/よ
   */
  function ch(vowel: StateMachine): StateMachine {
    const vowelKey = vowel.initialState.display.replace(/^y/, '');
    const full = `ch${vowelKey}`;
    const afterC = `h${vowelKey}`;
    const afterT = `y${vowelKey}`;
    const contracted = state.buildFromTransitions(full, [
      t(full, 'c', afterC),
      t(afterC, 'h', vowelKey, 1),
      t(full, 't', afterT, 1),
      t(afterT, 'y', vowelKey),
      t(vowelKey, vowelKey, '', 2),
    ]);
    const spelledOut = appendMachines(chi(), SMALL_KANA_MAPPING.get(vowel)!);
    return mergeMachines(contracted, spelledOut);
  }
  /**
   * Machine for じ followed by a small や/ゆ/よ (じゃ, じゅ, じょ).
   *
   * Merges the direct spelling `j` + vowel key (the vowel's initial display
   * with its leading `y` removed, built with an extra boundary at index 0)
   * with the generic `y` construction applied to a fresh じ machine.
   *
   * @param vowel machine for the full-size や/ゆ/よ
   */
  function j(vowel: StateMachine): StateMachine {
    const vowelKey = vowel.initialState.display.replace(/^y/, '');
    const direct = literal(`j${vowelKey}`, 0);
    const viaY = y(ji(), vowel);
    return mergeMachines(direct, viaY);
  }
  /**
   * Machine for っ followed by the kana handled by `base`.
   *
   * Two alternatives are merged:
   *  - the doubled-consonant spelling: for every key accepted by `base`'s
   *    initial state, that key must be typed once more (ending in a state with
   *    empty display and meta 1) before continuing with whatever `base` does
   *    after that key;
   *  - the small-つ machine (looked up through SMALL_KANA_MAPPING) followed by
   *    `base`.
   *
   * @param base machine for the kana that follows っ
   */
  function smallTsu(base: StateMachine): StateMachine {
    const first = base.initialState;
    const shown = first.display;
    const outgoing = first.transitions;
    // Display repeats the first character, e.g. `ka` -> `kka`.
    const doubled = new State(shown.charAt(0) + shown, 0);
    for (const key of Object.keys(outgoing)) {
      const following = outgoing[key];
      const repeat = new State(key, 0);
      repeat.addTransition(key, new State('', 1));
      doubled.addTransition(key, appendStates(repeat, following));
    }
    const doubledMachine = new StateMachine(doubled);
    const spelledOut = appendMachines(
      SMALL_KANA_MAPPING.get(KANA_MAPPING['つ'])!,
      base
    );
    return mergeMachines(doubledMachine, spelledOut);
  }
  /**
   * Machine for the small form of the kana handled by `base`.
   *
   * Starts from a clone of `base`'s initial state and adds two transitions,
   * on `l` and on `x`, that both lead to `base`'s own initial state, so the
   * kana can be typed with an `l` or `x` prefix.
   *
   * @param base machine for the full-size kana
   */
  function smallKana(base: StateMachine): StateMachine {
    const entry = base.initialState.clone();
    for (const prefix of ['l', 'x']) {
      entry.addTransition(prefix, base.initialState);
    }
    return new StateMachine(entry);
  }
  /**
   * Machine for ん followed by the kana handled by `base`, accepting either a
   * single `n` or `nn` before `base`.
   *
   * @param base machine for the kana that follows ん
   * @throws Error when `base`'s initial state has a transition on any of
   *   `n`, `a`, `i`, `u`, `e`, `o` or `y`
   */
  function n(base: StateMachine): StateMachine {
    const conflicts = ['n', 'a', 'i', 'u', 'e', 'o', 'y'].some((key) => {
      return base.initialState.transition(key) !== undefined;
    });
    if (conflicts) {
      throw new Error(
        `Invalid base ${base.initialState.display}, just defer to literal`
      );
    }
    const singleN = appendMachines(literal('n'), base);
    const doubleN = appendMachines(literal('nn'), base);
    return mergeMachines(singleN, doubleN);
  }
  /** Dictionary from a kana segment (or other matched text) to its machine. */
  interface KanaMapping {
    [segment: string]: StateMachine;
  }
  /** Dictionary from one string to another string. */
  interface StringMapping {
    [source: string]: string;
  }
  /**
   * Machine used for a space in the source text. It is displayed as `_` and
   * has a transition to the empty display on either `_` or a space.
   */
  const WHITESPACE = state.buildFromTransitions(
    '_',
    ['_', ' '].map((key) => t('_', key, ''))
  );
  /**
   * Maps each supported katakana character to its hiragana counterpart.
   *
   * Every pair in this table is exactly 0x60 code points apart (ア U+30A2 ->
   * あ U+3042, ヴ U+30F4 -> ゔ U+3094), so the values are derived from the
   * keys. Only the characters listed here are mapped; other katakana such as
   * ヮ have no entry.
   */
  const KATAKANA_MAPPING: StringMapping = {};
  [
    'アイウエオ',
    'カキクケコ',
    'サシスセソ',
    'タチツテト',
    'ナニヌネノ',
    'ハヒフヘホ',
    'マミムメモ',
    'ヤユヨ',
    'ラリルレロ',
    'ワヰヱヲン',
    'ガギグゲゴ',
    'ザジズゼゾ',
    'ダヂヅデド',
    'バビブベボ',
    'パピプペポ',
    'ヴ',
    'ァィゥェォ',
    'ャュョ',
    'ッ',
  ]
    .join('')
    .split('')
    .forEach((katakana) => {
      KATAKANA_MAPPING[katakana] = String.fromCharCode(
        katakana.charCodeAt(0) - 0x60
      );
    });
  /**
   * Machines for every single-character segment that can be typed directly:
   * the hiragana below, the long-vowel mark ー (typed as `-`) and the space.
   * Entries given as a string are built with `literal`; the others use the
   * dedicated builder (or the shared WHITESPACE machine). More entries are
   * added to this object further down the module.
   */
  export const KANA_MAPPING: KanaMapping = {};
  {
    const rows: Array<[string, string | (() => StateMachine)]> = [
      ['あ', 'a'], ['い', 'i'], ['う', 'u'], ['え', 'e'], ['お', 'o'],
      ['か', 'ka'], ['き', 'ki'], ['く', 'ku'], ['け', 'ke'], ['こ', 'ko'],
      ['さ', 'sa'], ['し', shi], ['す', 'su'], ['せ', 'se'], ['そ', 'so'],
      ['た', 'ta'], ['ち', chi], ['つ', tsu], ['て', 'te'], ['と', 'to'],
      ['な', 'na'], ['に', 'ni'], ['ぬ', 'nu'], ['ね', 'ne'], ['の', 'no'],
      ['は', 'ha'], ['ひ', 'hi'], ['ふ', fu], ['へ', 'he'], ['ほ', 'ho'],
      ['ま', 'ma'], ['み', 'mi'], ['む', 'mu'], ['め', 'me'], ['も', 'mo'],
      ['や', 'ya'], ['ゆ', 'yu'], ['よ', 'yo'],
      ['ら', 'ra'], ['り', 'ri'], ['る', 'ru'], ['れ', 're'], ['ろ', 'ro'],
      ['わ', 'wa'], ['ゐ', 'i'], ['ゑ', 'e'], ['を', 'wo'], ['ん', 'nn'],
      ['が', 'ga'], ['ぎ', 'gi'], ['ぐ', 'gu'], ['げ', 'ge'], ['ご', 'go'],
      ['ざ', 'za'], ['じ', ji], ['ず', 'zu'], ['ぜ', 'ze'], ['ぞ', 'zo'],
      ['だ', 'da'], ['ぢ', 'di'], ['づ', 'du'], ['で', 'de'], ['ど', 'do'],
      ['ば', 'ba'], ['び', 'bi'], ['ぶ', 'bu'], ['べ', 'be'], ['ぼ', 'bo'],
      ['ぱ', 'pa'], ['ぴ', 'pi'], ['ぷ', 'pu'], ['ぺ', 'pe'], ['ぽ', 'po'],
      ['ゔ', 'vu'], ['ー', '-'], [' ', () => WHITESPACE],
    ];
    for (const [kana, spec] of rows) {
      KANA_MAPPING[kana] = typeof spec === 'string' ? literal(spec) : spec();
    }
  }
  // Each lowercase Latin letter a-z maps to a machine that accepts exactly
  // that letter.
  for (let code = 'a'.charCodeAt(0); code <= 'z'.charCodeAt(0); ++code) {
    const letter = String.fromCharCode(code);
    KANA_MAPPING[letter] = literal(letter);
  }
  /**
   * Looks up the small-kana machine from the machine of its full-size kana.
   * Keys are the machine objects stored in KANA_MAPPING, compared by identity.
   */
  const SMALL_KANA_MAPPING = new Map<StateMachine, StateMachine>();
  // For each [small, full-size] pair: register the small kana in KANA_MAPPING
  // and remember which small machine belongs to which full-size machine.
  for (const [small, big] of [
    ['ぁ', 'あ'],
    ['ぃ', 'い'],
    ['ぅ', 'う'],
    ['ぇ', 'え'],
    ['ぉ', 'お'],
    ['ヵ', 'か'],
    ['っ', 'つ'],
    ['ゃ', 'や'],
    ['ゅ', 'ゆ'],
    ['ょ', 'よ'],
  ]) {
    const fullSize = KANA_MAPPING[big];
    KANA_MAPPING[small] = smallKana(fullSize);
    SMALL_KANA_MAPPING.set(KANA_MAPPING[big], KANA_MAPPING[small]);
  }
  // Two-character segments: an i-column kana plus small や/ゆ/よ, and ふ/ゔ
  // plus a small vowel.
  {
    const smallYVowels = [
      ['ゃ', 'や'],
      ['ゅ', 'ゆ'],
      ['ょ', 'よ'],
    ];
    // Bases with a dedicated builder use it; `null` means the generic `y`
    // construction applied to the base's own machine.
    const yRows: Array<
      [string, ((vowel: StateMachine) => StateMachine) | null]
    > = [
      ['き', null],
      ['し', sh],
      ['ち', ch],
      ['に', null],
      ['ひ', null],
      ['み', null],
      ['り', null],
      ['ぎ', null],
      ['じ', j],
      ['ぢ', null],
      ['び', null],
      ['ぴ', null],
    ];
    for (const [base, special] of yRows) {
      for (const [small, full] of smallYVowels) {
        const vowel = KANA_MAPPING[full];
        KANA_MAPPING[base + small] =
          special !== null ? special(vowel) : y(KANA_MAPPING[base], vowel);
      }
    }
    // ふ and ゔ combine with the small vowels ぁ ぃ ぇ ぉ (no ぅ entry).
    const vowelRows: Array<[string, (vowel: StateMachine) => StateMachine]> = [
      ['ふ', f],
      ['ゔ', v],
    ];
    for (const [base, build] of vowelRows) {
      for (const small of ['ぁ', 'ぃ', 'ぇ', 'ぉ']) {
        KANA_MAPPING[base + small] = build(KANA_MAPPING[small]);
      }
    }
  }
  // For every single kana below, register the two-character segments
  // っ + kana (doubled consonant) and ん + kana (single or double n).
  for (const kana of [
    'かきくけこ',
    'さしすせそ',
    'たちつてと',
    'はひふへほ',
    'がぎぐげご',
    'ざじずぜぞ',
    'だぢづでど',
    'ばびぶべぼ',
    'ぱぴぷぺぽ',
    'ゔ',
  ]
    .join('')
    .split('')) {
    KANA_MAPPING['っ' + kana] = smallTsu(KANA_MAPPING[kana]);
    KANA_MAPPING['ん' + kana] = n(KANA_MAPPING[kana]);
  }
  // For every two-character sound below, register the three-character
  // segments っ + sound (doubled consonant) and ん + sound (single or double n).
  //
  // ひゃ/ひゅ/ひょ must be listed here: `っひ` and `んひ` are registered as
  // two-character segments, and KanaInputState matches the longest segment
  // first. Without a three-character entry, よんひゃく is split into んひ + ゃ
  // and the contracted spelling (`yonhyaku`) is never accepted.
  [
    'きゃ',
    'きゅ',
    'きょ',
    'しゃ',
    'しゅ',
    'しょ',
    'ちゃ',
    'ちゅ',
    'ちょ',
    'ひゃ',
    'ひゅ',
    'ひょ',
    'ぎゃ',
    'ぎゅ',
    'ぎょ',
    'じゃ',
    'じゅ',
    'じょ',
    'ぢゃ',
    'ぢゅ',
    'ぢょ',
    'びゃ',
    'びゅ',
    'びょ',
    'ぴゃ',
    'ぴゅ',
    'ぴょ',
    'ふぁ',
    'ふぃ',
    'ふぇ',
    'ふぉ',
    'ゔぁ',
    'ゔぃ',
    'ゔぇ',
    'ゔぉ',
  ].forEach((kana) => {
    KANA_MAPPING['っ' + kana] = smallTsu(KANA_MAPPING[kana]);
    KANA_MAPPING['ん' + kana] = n(KANA_MAPPING[kana]);
  });
  /**
   * Normalizes input for matching. Letters are lower-cased, katakana listed in
   * KATAKANA_MAPPING is transformed to hiragana, and every whitespace
   * character becomes a plain space.
   *
   * The result always has the same length (in UTF-16 code units) as `input`,
   * because callers match it position-for-position against the original text
   * in order to display the original source kana.
   *
   * @param input the raw source text
   * @returns the normalized text, the same length as `input`
   */
  export function normalizeInput(input: string): string {
    // Lower-casing the whole string can change its length: 'İ' (U+0130)
    // becomes 'i' followed by U+0307. When that happens, lower-case one
    // character at a time and keep any character whose lower-case form has a
    // different length.
    const wholeLowered = input.toLowerCase();
    const lowered =
      wholeLowered.length === input.length
        ? wholeLowered
        : Array.from(input, (char) => {
            const lower = char.toLowerCase();
            return lower.length === char.length ? lower : char;
          }).join('');
    return lowered
      .split('')
      .map((letter) => {
        const hiragana = KATAKANA_MAPPING[letter];
        if (hiragana !== undefined) {
          return hiragana;
        }
        return /\s/.test(letter) ? ' ' : letter;
      })
      .join('');
  }
  /**
   * Tracks typing progress through a piece of source text.
   *
   * The text is split into segments that have an entry in KANA_MAPPING. Each
   * segment gets its own clone of the mapped machine, and every machine is
   * linked to the one after it through `nextMachine`.
   */
  export class KanaInputState {
    /** Original (un-normalized) text of each matched segment. */
    kana: string[];
    /** One machine per entry of `kana`, in the same order. */
    stateMachines: StateMachine[];
    /** Index of the machine that receives the next key. */
    currentIndex: number;

    /**
     * @param input source text; characters with no KANA_MAPPING entry are
     *   skipped
     */
    constructor(input: string) {
      const segments: string[] = [];
      const chain: StateMachine[] = [];
      // One trailing space of padding for the look-ahead windows below.
      const normalized = normalizeInput(input) + ' ';
      let cursor = 0;
      while (cursor < input.length) {
        // An unmapped character is skipped by advancing a single position.
        let consumed = 1;
        // Try the longest window first: 3 characters, then 2, then 1.
        for (let width = 3; width >= 1; --width) {
          const template = KANA_MAPPING[normalized.slice(cursor, cursor + width)];
          if (template == null) {
            continue;
          }
          segments.push(input.slice(cursor, cursor + width));
          const machine = template.clone();
          if (chain.length > 0) {
            chain[chain.length - 1].nextMachine = machine;
          }
          chain.push(machine);
          consumed = width;
          break;
        }
        cursor += consumed;
      }
      this.kana = segments;
      this.stateMachines = chain;
      this.currentIndex = 0;
    }

    /**
     * Calls `func` with every segment and its machine, in order.
     *
     * @returns the collected results
     */
    map<T>(func: (s: string, m: StateMachine) => T): T[] {
      const collected: T[] = [];
      for (const [index, segment] of this.kana.entries()) {
        collected.push(func(segment, this.stateMachines[index]));
      }
      return collected;
    }

    /**
     * Feeds one key to the current machine, then moves past every machine that
     * reports it is finished.
     *
     * @returns true when this key moved the index past the last machine;
     *   false otherwise, including when everything was already finished before
     *   the call
     */
    handleInput(input: string): boolean {
      if (this.currentIndex >= this.stateMachines.length) {
        return false;
      }
      let active = this.stateMachines[this.currentIndex];
      active.transition(input);
      while (active.isFinished()) {
        this.currentIndex += 1;
        active = this.stateMachines[this.currentIndex];
        if (active == null) {
          return true;
        }
      }
      return this.currentIndex >= this.stateMachines.length;
    }

    /** @returns true once the index has moved past the last machine. */
    isFinished(): boolean {
      return this.currentIndex >= this.stateMachines.length;
    }

    /**
     * @returns the displays of the current machine and all later machines,
     *   concatenated
     */
    getRemainingInput(): string {
      return this.stateMachines
        .slice(this.currentIndex)
        .reduce((text, machine) => text + machine.getDisplay(), '');
    }
  }