kana.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  /**
   * This module matches romaji keystrokes against a target kana string. Most
   * kana map one-to-one to a romaji spelling, but some can be typed in more
   * than one way, and っ has to be handled specially because it doubles the
   * consonant that follows it.
   *
   * Each kana gets its own state machine that accepts every supported romaji
   * spelling. The machine also tracks what is left to type and adjusts that
   * remainder when an alternative spelling is used.
   *
   * Handling っ is a key consideration: it has no spelling of its own and
   * instead modifies the state machine of the kana that follows it. This
   * requires intermediate states, and care must be taken over what they show
   * in the display.
   */
  17. import * as state from './state';
  18. import { State, StateMachine, makeTransition as t } from './state';
  /**
   * Builds a machine that spells out `source` one character at a time.
   *
   * One transition is generated per character: from the suffix of `source`
   * starting at that character, on that character, to the suffix after it.
   * The transition for the last character always carries the boundary flag;
   * the transition for any index listed in `extraBoundaries` carries it too.
   *
   * @param source romaji spelling; also passed to `buildFromTransitions` as
   *   its first argument (presumably the name of the initial state)
   * @param extraBoundaries zero-based character indices whose transition
   *   should additionally be flagged as a boundary
   * @returns whatever `state.buildFromTransitions` produces for the table
   */
  function literal(source: string, ...extraBoundaries: number[]): StateMachine {
    const lastIndex = source.length - 1;
    const transitions: state.Transition[] = source
      .split('')
      .map((input, index) => {
        const isBoundary =
          index === lastIndex || extraBoundaries.some((b) => b === index);
        return t(
          source.substring(index),
          input,
          source.substring(index + 1),
          isBoundary,
        );
      });
    return state.buildFromTransitions(source, transitions);
  }
  /**
   * Machine for し. The table spells "shi" and also lets the 'h' be skipped
   * ("si"): from the "hi" state both 'h' and 'i' are accepted.
   */
  function shi(): StateMachine {
    const table: state.Transition[] = [
      t('shi', 's', 'hi'),
      t('hi', 'h', 'i'),
      t('hi', 'i', '', true),
      t('i', 'i', '', true),
    ];
    return state.buildFromTransitions('shi', table);
  }
  /**
   * Machine for ち. The start state accepts 'c' (continuing through "hi") or
   * 't' (jumping straight to "i"), so the table spells "chi" and "ti".
   */
  function chi(): StateMachine {
    const table: state.Transition[] = [
      t('chi', 'c', 'hi'),
      t('chi', 't', 'i'),
      t('hi', 'h', 'i'),
      t('i', 'i', '', true),
    ];
    return state.buildFromTransitions('chi', table);
  }
  /**
   * Machine for つ. The table spells "tsu" and also lets the 's' be skipped
   * ("tu"): from the "su" state both 's' and 'u' are accepted.
   */
  function tsu(): StateMachine {
    const table: state.Transition[] = [
      t('tsu', 't', 'su'),
      t('su', 's', 'u'),
      t('su', 'u', '', true),
      t('u', 'u', '', true),
    ];
    return state.buildFromTransitions('tsu', table);
  }
  /**
   * Machine for ふ. The start state accepts either 'f' or 'h' before the
   * final 'u', so the table spells "fu" and "hu".
   */
  function fu(): StateMachine {
    const table: state.Transition[] = [
      t('fu', 'f', 'u'),
      t('fu', 'h', 'u'),
      t('u', 'u', '', true),
    ];
    return state.buildFromTransitions('fu', table);
  }
  /**
   * Machine for じ. The start state accepts either 'j' or 'z' before the
   * final 'i', so the table spells "ji" and "zi".
   */
  function ji(): StateMachine {
    const table: state.Transition[] = [
      t('ji', 'j', 'i'),
      t('ji', 'z', 'i'),
      t('i', 'i', '', true),
    ];
    return state.buildFromTransitions('ji', table);
  }
  /**
   * Machine for the しゃ/しゅ/しょ family.
   *
   * After the leading 's' (flagged as a boundary) either 'h' or 'y' is
   * accepted, followed by the vowel, so the table spells "sh<end>" and
   * "sy<end>".
   *
   * @param end the trailing vowel, used both as a state name and as the key
   *   of the final transition
   */
  function sh(end: string): StateMachine {
    const afterS = 'h' + end;
    const start = 's' + afterS;
    const table: state.Transition[] = [
      t(start, 's', afterS, true),
      t(afterS, 'h', end),
      t(afterS, 'y', end),
      t(end, end, '', true),
    ];
    return state.buildFromTransitions(start, table);
  }
  /**
   * Machine for the ちゃ/ちゅ/ちょ family.
   *
   * The start state accepts 'c' (then 'h') or 't' (then 'y') before the
   * vowel, so the table spells "ch<end>" and "ty<end>". The boundary flag is
   * set on the 'h' of the first spelling and on the 't' of the second.
   *
   * @param end the trailing vowel, used both as a state name and as the key
   *   of the final transition
   */
  function ch(end: string): StateMachine {
    const afterC = 'h' + end;
    const afterT = 'y' + end;
    const start = 'c' + afterC;
    const table: state.Transition[] = [
      t(start, 'c', afterC),
      t(afterC, 'h', end, true),
      t(start, 't', afterT, true),
      t(afterT, 'y', end),
      t(end, end, '', true),
    ];
    return state.buildFromTransitions(start, table);
  }
  /**
   * Machine for the じゃ/じゅ/じょ family.
   *
   * The start state accepts 'j' (going straight to the vowel state) or 'z'
   * (which must be followed by 'y'). The vowel state additionally has a 'y'
   * transition back to itself, which is what allows the "jy<end>" spelling.
   * NOTE(review): as written that self-loop does not limit how many 'y's are
   * accepted before the vowel - confirm whether that is intended.
   *
   * @param end the trailing vowel, used both as a state name and as the key
   *   of the final transition
   */
  function j(end: string): StateMachine {
    const start = 'j' + end;
    const afterZ = 'y' + end;
    const table: state.Transition[] = [
      t(start, 'j', end, true),
      t(start, 'z', afterZ),
      t(end, 'y', end),
      t(afterZ, 'y', end, true),
      t(end, end, '', true),
    ];
    return state.buildFromTransitions(start, table);
  }
  /**
   * Derives the machine for っ followed by the kana that `base` represents.
   *
   * A new start state is created whose display is the base display with its
   * first character doubled. For every key accepted by the base start state,
   * an intermediate state is inserted (display: that key plus the display of
   * the state the base would have moved to); the new start state reaches it
   * on that key, flagged as a boundary, and the intermediate state then
   * continues on the same key to the base's original target. The key
   * therefore has to be typed twice.
   *
   * The targets and the final state are the base machine's own objects, not
   * copies.
   *
   * @param base machine for the kana that follows the っ
   */
  function smallTsu(base: StateMachine): StateMachine {
    const { display: baseDisplay, transitions: firstMoves } = base.initialState;
    const doubledStart = new State(baseDisplay.charAt(0) + baseDisplay);
    for (const key of Object.keys(firstMoves)) {
      const [target] = firstMoves[key];
      const bridge = new State(key + target.display);
      bridge.addTransition(key, target);
      doubledStart.addTransition(key, bridge, true);
    }
    return new StateMachine(doubledStart, base.finalState);
  }
  /**
   * Derives the machine for the small form of the kana that `base` represents.
   *
   * The new start state is a clone of the base start state with two extra
   * transitions, 'l' and 'x', both leading to the base's own start state. The
   * small kana can therefore be typed like the full-size one, optionally
   * preceded by 'l' or 'x'.
   *
   * @param base machine for the full-size kana
   */
  function smallKana(base: StateMachine): StateMachine {
    const entry = base.initialState.clone();
    for (const prefix of ['l', 'x']) {
      entry.addTransition(prefix, base.initialState);
    }
    return new StateMachine(entry, base.finalState);
  }
  /** Lookup table from a (normalized) kana sequence to its template machine. */
  interface KanaMapping {
    [kana: string]: StateMachine;
  }

  /** Plain string-to-string lookup table. */
  interface StringMapping {
    [key: string]: string;
  }
  /**
   * Machine for a space in the source text. Its start state is named '_' and
   * it finishes on either an underscore or a literal space.
   */
  const WHITESPACE = state.buildFromTransitions(
    '_',
    ['_', ' '].map((key) => t('_', key, '')),
  );
  /**
   * Katakana to hiragana lookup used when normalizing input. Every key and
   * every value is a single character, so substituting one for the other
   * never changes the length of a string.
   */
  const KATAKANA_MAPPING: StringMapping = {
    // plain syllables
    'ア': 'あ', 'イ': 'い', 'ウ': 'う', 'エ': 'え', 'オ': 'お',
    'カ': 'か', 'キ': 'き', 'ク': 'く', 'ケ': 'け', 'コ': 'こ',
    'サ': 'さ', 'シ': 'し', 'ス': 'す', 'セ': 'せ', 'ソ': 'そ',
    'タ': 'た', 'チ': 'ち', 'ツ': 'つ', 'テ': 'て', 'ト': 'と',
    'ナ': 'な', 'ニ': 'に', 'ヌ': 'ぬ', 'ネ': 'ね', 'ノ': 'の',
    'ハ': 'は', 'ヒ': 'ひ', 'フ': 'ふ', 'ヘ': 'へ', 'ホ': 'ほ',
    'マ': 'ま', 'ミ': 'み', 'ム': 'む', 'メ': 'め', 'モ': 'も',
    'ヤ': 'や', 'ユ': 'ゆ', 'ヨ': 'よ',
    'ラ': 'ら', 'リ': 'り', 'ル': 'る', 'レ': 'れ', 'ロ': 'ろ',
    'ワ': 'わ', 'ヰ': 'ゐ', 'ヱ': 'ゑ', 'ヲ': 'を',
    'ン': 'ん',
    // voiced and semi-voiced syllables
    'ガ': 'が', 'ギ': 'ぎ', 'グ': 'ぐ', 'ゲ': 'げ', 'ゴ': 'ご',
    'ザ': 'ざ', 'ジ': 'じ', 'ズ': 'ず', 'ゼ': 'ぜ', 'ゾ': 'ぞ',
    'ダ': 'だ', 'ヂ': 'ぢ', 'ヅ': 'づ', 'デ': 'で', 'ド': 'ど',
    'バ': 'ば', 'ビ': 'び', 'ブ': 'ぶ', 'ベ': 'べ', 'ボ': 'ぼ',
    'パ': 'ぱ', 'ピ': 'ぴ', 'プ': 'ぷ', 'ペ': 'ぺ', 'ポ': 'ぽ',
    'ヴ': 'ゔ',
    // small kana
    'ァ': 'ぁ', 'ィ': 'ぃ', 'ゥ': 'ぅ', 'ェ': 'ぇ', 'ォ': 'ぉ',
    'ャ': 'ゃ', 'ュ': 'ゅ', 'ョ': 'ょ',
    'ッ': 'っ',
  };
  /**
   * Template machines for input segments that are one character long:
   * individual hiragana, the long-vowel mark and the space. Kana with more
   * than one accepted spelling use their dedicated builder instead of
   * `literal`. Further entries are added to this table after its declaration.
   */
  const SINGLE_KANA_MAPPING: KanaMapping = {
    // vowels
    'あ': literal('a'), 'い': literal('i'), 'う': literal('u'),
    'え': literal('e'), 'お': literal('o'),
    // k-row
    'か': literal('ka'), 'き': literal('ki'), 'く': literal('ku'),
    'け': literal('ke'), 'こ': literal('ko'),
    // s-row
    'さ': literal('sa'), 'し': shi(), 'す': literal('su'),
    'せ': literal('se'), 'そ': literal('so'),
    // t-row
    'た': literal('ta'), 'ち': chi(), 'つ': tsu(),
    'て': literal('te'), 'と': literal('to'),
    // n-row
    'な': literal('na'), 'に': literal('ni'), 'ぬ': literal('nu'),
    'ね': literal('ne'), 'の': literal('no'),
    // h-row
    'は': literal('ha'), 'ひ': literal('hi'), 'ふ': fu(),
    'へ': literal('he'), 'ほ': literal('ho'),
    // m-row
    'ま': literal('ma'), 'み': literal('mi'), 'む': literal('mu'),
    'め': literal('me'), 'も': literal('mo'),
    // y-row
    'や': literal('ya'), 'ゆ': literal('yu'), 'よ': literal('yo'),
    // r-row
    'ら': literal('ra'), 'り': literal('ri'), 'る': literal('ru'),
    'れ': literal('re'), 'ろ': literal('ro'),
    // w-row and ん
    'わ': literal('wa'), 'ゐ': literal('i'), 'ゑ': literal('e'),
    'を': literal('wo'),
    'ん': literal('n'),
    // g-row
    'が': literal('ga'), 'ぎ': literal('gi'), 'ぐ': literal('gu'),
    'げ': literal('ge'), 'ご': literal('go'),
    // z-row
    'ざ': literal('za'), 'じ': ji(), 'ず': literal('zu'),
    'ぜ': literal('ze'), 'ぞ': literal('zo'),
    // d-row
    'だ': literal('da'), 'ぢ': literal('di'), 'づ': literal('du'),
    'で': literal('de'), 'ど': literal('do'),
    // b-row
    'ば': literal('ba'), 'び': literal('bi'), 'ぶ': literal('bu'),
    'べ': literal('be'), 'ぼ': literal('bo'),
    // p-row
    'ぱ': literal('pa'), 'ぴ': literal('pi'), 'ぷ': literal('pu'),
    'ぺ': literal('pe'), 'ぽ': literal('po'),
    // others
    'ゔ': literal('vu'),
    'ー': literal('-'),
    ' ': WHITESPACE,
  };
  // Lower-case Latin letters in the source text are typed as themselves.
  for (const letter of 'abcdefghijklmnopqrstuvwxyz') {
    SINGLE_KANA_MAPPING[letter] = literal(letter);
  }
  // Small kana reuse the machine of their full-size counterpart, wrapped by
  // smallKana so that an 'l' or 'x' prefix is accepted as well.
  for (const [small, big] of [
    ['ぁ', 'あ'],
    ['ぃ', 'い'],
    ['ぅ', 'う'],
    ['ぇ', 'え'],
    ['ぉ', 'お'],
    ['ヵ', 'か'],
  ]) {
    SINGLE_KANA_MAPPING[small] = smallKana(SINGLE_KANA_MAPPING[big]);
  }
  /**
   * Template machines for input segments that are two characters long: a
   * kana followed by a small kana. Entries built with `literal(..., 0)` flag
   * the transition for the first romaji character as a boundary. The
   * し/ち/じ families use their dedicated builders because they have more
   * than one spelling. Further entries are added to this table after its
   * declaration.
   */
  const DOUBLE_KANA_MAPPING: KanaMapping = {
    // k / sh / ch
    'きゃ': literal('kya', 0), 'きゅ': literal('kyu', 0), 'きょ': literal('kyo', 0),
    'しゃ': sh('a'), 'しゅ': sh('u'), 'しょ': sh('o'),
    'ちゃ': ch('a'), 'ちゅ': ch('u'), 'ちょ': ch('o'),
    // n / h / m / r
    'にゃ': literal('nya', 0), 'にゅ': literal('nyu', 0), 'にょ': literal('nyo', 0),
    'ひゃ': literal('hya', 0), 'ひゅ': literal('hyu', 0), 'ひょ': literal('hyo', 0),
    'みゃ': literal('mya', 0), 'みゅ': literal('myu', 0), 'みょ': literal('myo', 0),
    'りゃ': literal('rya', 0), 'りゅ': literal('ryu', 0), 'りょ': literal('ryo', 0),
    // g / j / d
    'ぎゃ': literal('gya', 0), 'ぎゅ': literal('gyu', 0), 'ぎょ': literal('gyo', 0),
    'じゃ': j('a'), 'じゅ': j('u'), 'じょ': j('o'),
    'ぢゃ': literal('dya', 0), 'ぢゅ': literal('dyu', 0), 'ぢょ': literal('dyo', 0),
    // b / p
    'びゃ': literal('bya', 0), 'びゅ': literal('byu', 0), 'びょ': literal('byo', 0),
    'ぴゃ': literal('pya', 0), 'ぴゅ': literal('pyu', 0), 'ぴょ': literal('pyo', 0),
    // f / v with a small vowel
    'ふぁ': literal('fa', 0), 'ふぃ': literal('fi', 0),
    'ふぇ': literal('fe', 0), 'ふぉ': literal('fo', 0),
    'ゔぁ': literal('va', 0), 'ゔぃ': literal('vi', 0),
    'ゔぇ': literal('ve', 0), 'ゔぉ': literal('vo', 0),
  };
  // っ followed by a single kana is a two-character segment, so these
  // machines are registered in DOUBLE_KANA_MAPPING. Each one is derived from
  // the single-kana machine via smallTsu.
  for (const kana of [
    'か', 'き', 'く', 'け', 'こ',
    'さ', 'し', 'す', 'せ', 'そ',
    'た', 'ち', 'つ', 'て', 'と',
    'は', 'ひ', 'ふ', 'へ', 'ほ',
    'が', 'ぎ', 'ぐ', 'げ', 'ご',
    'ざ', 'じ', 'ず', 'ぜ', 'ぞ',
    'だ', 'ぢ', 'づ', 'で', 'ど',
    'ば', 'び', 'ぶ', 'べ', 'ぼ',
    'ぱ', 'ぴ', 'ぷ', 'ぺ', 'ぽ',
    'ゔ',
  ]) {
    DOUBLE_KANA_MAPPING['っ' + kana] = smallTsu(SINGLE_KANA_MAPPING[kana]);
  }

  /**
   * Template machines for input segments that are three characters long:
   * っ followed by a two-character kana combination.
   */
  const TRIPLE_KANA_MAPPING: KanaMapping = {};

  // Each entry is derived from the matching DOUBLE_KANA_MAPPING machine.
  for (const kana of [
    'きゃ', 'きゅ', 'きょ',
    'しゃ', 'しゅ', 'しょ',
    'ちゃ', 'ちゅ', 'ちょ',
    'ぎゃ', 'ぎゅ', 'ぎょ',
    'じゃ', 'じゅ', 'じょ',
    'ぢゃ', 'ぢゅ', 'ぢょ',
    'びゃ', 'びゅ', 'びょ',
    'ぴゃ', 'ぴゅ', 'ぴょ',
    'ふぁ', 'ふぃ', 'ふぇ', 'ふぉ',
    'ゔぁ', 'ゔぃ', 'ゔぇ', 'ゔぉ',
  ]) {
    TRIPLE_KANA_MAPPING['っ' + kana] = smallTsu(DOUBLE_KANA_MAPPING[kana]);
  }
  /**
   * Normalizes text for matching without changing its length.
   *
   * Every UTF-16 code unit is processed on its own: Latin letters are
   * lower-cased, katakana listed in `KATAKANA_MAPPING` become hiragana, and
   * anything matching `\s` becomes a plain space. All other code units pass
   * through unchanged.
   *
   * The result must stay index-aligned with `input`, because callers slice
   * both strings with the same offsets in order to display the original
   * source kana. Lower-casing is therefore applied per code unit instead of
   * to the whole string: `String.prototype.toLowerCase` can lengthen a string
   * ('İ', U+0130, becomes 'i' followed by U+0307), which would shift every
   * later character. A code unit whose lower-case form is not exactly one
   * unit long is left as it is.
   *
   * @param input text as supplied by the caller
   * @returns normalized text with `result.length === input.length`
   */
  export function normalizeInput(input: string): string {
    return input
      .split('')
      .map((unit) => {
        const lowered = unit.toLowerCase();
        // keep the original unit if lower-casing would change the length
        const letter = lowered.length === 1 ? lowered : unit;
        const hiragana = KATAKANA_MAPPING[letter];
        if (hiragana !== undefined) {
          return hiragana;
        }
        return /\s/.test(letter) ? ' ' : letter;
      })
      .join('');
  }
  /**
   * Typing state for one piece of source text.
   *
   * The text is split into segments of up to three characters, each backed by
   * its own clone of a template state machine. Keystrokes are fed to the
   * machine at `currentIndex`, which advances as machines finish.
   */
  export class KanaInputState {
    /** Original-text segments, index-aligned with `stateMachines`. */
    kana: string[];
    /** One machine per segment; each is linked to its successor via `nextMachine`. */
    stateMachines: StateMachine[];
    /** Index of the machine that receives the next keystroke. */
    currentIndex: number;

    /**
     * Splits `input` into segments and builds the machine chain.
     *
     * At every position the longest match wins: a three-character segment is
     * tried first, then two, then one. Characters with no mapping are skipped
     * and do not appear in `kana`.
     *
     * @param input source text in its original (non-normalized) form
     */
    constructor(input: string) {
      // index w - 1 holds the table for segments that are w characters long
      const tablesByLength = [
        SINGLE_KANA_MAPPING,
        DOUBLE_KANA_MAPPING,
        TRIPLE_KANA_MAPPING,
      ];
      // we pad the input so checking 3 at a time is simpler
      const normalized = normalizeInput(input) + ' ';
      const segments: string[] = [];
      const chain: StateMachine[] = [];

      let cursor = 0;
      while (cursor < input.length) {
        // default for an unmapped character: step over it
        let step = 1;
        for (let width = 3; width >= 1; --width) {
          const end = cursor + width;
          const template = tablesByLength[width - 1][normalized.slice(cursor, end)];
          if (template == undefined) {
            continue;
          }
          segments.push(input.slice(cursor, end));
          const machine = template.clone();
          if (chain.length > 0) {
            chain[chain.length - 1].nextMachine = machine;
          }
          chain.push(machine);
          step = width;
          break;
        }
        cursor += step;
      }

      this.kana = segments;
      this.stateMachines = chain;
      this.currentIndex = 0;
    }

    /**
     * Calls `func` with every segment and its machine, in order.
     *
     * @returns the collected results of `func`
     */
    map<T>(func: (s: string, m: StateMachine) => T): T[] {
      return this.kana.map((segment, index) =>
        func(segment, this.stateMachines[index]),
      );
    }

    /**
     * Feeds one keystroke to the current machine and then moves past every
     * machine that reports itself finished.
     *
     * @param input the typed character
     * @returns `true` when this keystroke finished the last machine; `false`
     *   otherwise, including when everything had already been completed
     */
    handleInput(input: string): boolean {
      if (this.currentIndex >= this.stateMachines.length) {
        return false;
      }
      let active = this.stateMachines[this.currentIndex];
      active.transition(input);
      while (active.isFinished()) {
        this.currentIndex += 1;
        active = this.stateMachines[this.currentIndex];
        if (active == null) {
          // ran off the end of the chain: everything has been typed
          return true;
        }
      }
      return this.currentIndex >= this.stateMachines.length;
    }

    /**
     * Concatenates the display text of the current machine and every machine
     * after it.
     */
    getRemainingInput(): string {
      return this.stateMachines
        .slice(this.currentIndex)
        .reduce((remaining, machine) => remaining + machine.getDisplay(), '');
    }
  }