123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565 |
- /**
- * This module is mainly for handling romaji input to match the provided kana
- * input. While most kana map one-to-one with romaji, some kana have multiple
- * ways to be inputted. In addition, we also have to handle っ which causes the
- * next consonant to be repeated.
- *
- * The state management is done by having a state machine for each kana and it
- * should handle all possible variations of the romaji to be inputted.
- * Additionally, it also keeps track of what is left to be input, and adjusts
- * itself accordingly if an alternative romaji was used.
- *
- * One of the key considerations is handling っ. It doesn't have a spelling in
- * and of itself, but just modifies the state machine that will come after it.
- * Intermediate states need to be created and care should be given in what shows
- * up in the display.
- */
- import * as state from './state';
- import { State, StateMachine, makeTransition as t } from './state';
/**
 * Builds a machine that accepts `source` exactly as written, one character per
 * transition.
 *
 * Each state is named after the text still left to type, so the transition at
 * index `i` goes from `source.substring(i)` to `source.substring(i + 1)` on
 * the character at `i`.
 *
 * @param source the romaji spelling to accept
 * @param extraBoundaries character indices whose transition is flagged as a
 *   boundary, in addition to the last character which always is
 */
function literal(source: string, ...extraBoundaries: number[]): StateMachine {
  const lastIndex = source.length - 1;
  const transitions = source.split('').map((input, index) => {
    const isBoundary =
      index === lastIndex || extraBoundaries.indexOf(index) !== -1;
    return t(
      source.substring(index),
      input,
      source.substring(index + 1),
      isBoundary,
    );
  });
  return state.buildFromTransitions(source, transitions);
}
/**
 * Machine for し. The transitions spell out both `s-h-i` and `s-i`: after the
 * leading `s`, either `h` or `i` is accepted.
 */
function shi(): StateMachine {
  const transitions = [
    t('shi', 's', 'hi'),
    // from "hi" the `h` may be typed or skipped
    t('hi', 'h', 'i'),
    t('hi', 'i', '', true),
    t('i', 'i', '', true),
  ];
  return state.buildFromTransitions('shi', transitions);
}
/**
 * Machine for ち. The transitions spell out `c-h-i` and the alternative
 * `t-i`: the first key decides which path is taken.
 */
function chi(): StateMachine {
  const transitions = [
    t('chi', 'c', 'hi'),
    // alternative opening key jumps straight to the final vowel
    t('chi', 't', 'i'),
    t('hi', 'h', 'i'),
    t('i', 'i', '', true),
  ];
  return state.buildFromTransitions('chi', transitions);
}
/**
 * Machine for つ. The transitions spell out `t-s-u` and `t-u`: after the
 * leading `t`, either `s` or `u` is accepted.
 */
function tsu(): StateMachine {
  const transitions = [
    t('tsu', 't', 'su'),
    // from "su" the `s` may be typed or skipped
    t('su', 's', 'u'),
    t('su', 'u', '', true),
    t('u', 'u', '', true),
  ];
  return state.buildFromTransitions('tsu', transitions);
}
/**
 * Machine for ふ. The first key may be either `f` or `h`; both lead to the
 * same state waiting for `u`.
 */
function fu(): StateMachine {
  const transitions = [
    t('fu', 'f', 'u'),
    t('fu', 'h', 'u'),
    t('u', 'u', '', true),
  ];
  return state.buildFromTransitions('fu', transitions);
}
/**
 * Machine for じ. The first key may be either `j` or `z`; both lead to the
 * same state waiting for `i`.
 */
function ji(): StateMachine {
  const transitions = [
    t('ji', 'j', 'i'),
    t('ji', 'z', 'i'),
    t('i', 'i', '', true),
  ];
  return state.buildFromTransitions('ji', transitions);
}
/**
 * Machine for the しゃ/しゅ/しょ family. After the leading `s`, the middle
 * key may be `h` or `y` before the final vowel.
 *
 * The `s` transition and the final vowel transition are created with the
 * boundary flag set.
 * NOTE(review): the flag presumably marks a kana boundary for display —
 * confirm against ./state.
 *
 * @param end the trailing vowel (the visible callers pass 'a', 'u' or 'o')
 */
function sh(end: string): StateMachine {
  const afterS = `h${end}`;
  const whole = `s${afterS}`;
  return state.buildFromTransitions(whole, [
    t(whole, 's', afterS, true),
    t(afterS, 'h', end),
    t(afterS, 'y', end),
    t(end, end, '', true),
  ]);
}
/**
 * Machine for the ちゃ/ちゅ/ちょ family. The transitions spell out `c-h-<end>`
 * and the alternative `t-y-<end>`.
 *
 * Boundary flags are set on the `h` step of the `ch` path, on the `t` step of
 * the `ty` path, and on the final vowel.
 * NOTE(review): the two paths flag different positions — confirm this
 * asymmetry is intended.
 *
 * @param end the trailing vowel (the visible callers pass 'a', 'u' or 'o')
 */
function ch(end: string): StateMachine {
  const whole = `ch${end}`;
  const afterC = `h${end}`;
  const afterT = `y${end}`;
  return state.buildFromTransitions(whole, [
    // "ch" path
    t(whole, 'c', afterC),
    t(afterC, 'h', end, true),
    // "ty" path
    t(whole, 't', afterT, true),
    t(afterT, 'y', end),
    // shared final vowel
    t(end, end, '', true),
  ]);
}
/**
 * Machine for the じゃ/じゅ/じょ family. The transitions spell out `j-<end>`
 * and `z-y-<end>`, plus a `y` transition on the state that waits for the
 * final vowel.
 *
 * NOTE(review): that `y` transition leads from the `end` state back to the
 * same `end` state. It looks like it exists to allow "jya", but as written
 * it would also permit repeated `y` — confirm against how ./state resolves
 * it.
 *
 * @param end the trailing vowel (the visible callers pass 'a', 'u' or 'o')
 */
function j(end: string): StateMachine {
  const whole = `j${end}`;
  const afterZ = `y${end}`;
  return state.buildFromTransitions(whole, [
    t(whole, 'j', end, true),
    t(whole, 'z', afterZ),
    // `y` accepted while waiting for the vowel; stays in the same state
    t(end, 'y', end),
    t(afterZ, 'y', end, true),
    t(end, end, '', true),
  ]);
}
/**
 * Wraps `base` so that its first key has to be typed twice, which is how a
 * leading っ is entered.
 *
 * The new initial state displays the base display with its first character
 * doubled. For every key `k` the base accepts first, an intermediate state is
 * inserted whose display is `k` followed by the display of the state `k`
 * originally led to. The new initial state goes to that intermediate state on
 * `k` (with the boundary flag set), and the intermediate state goes on to the
 * original target on a second `k`.
 *
 * The states of `base` are reused, not copied; the result shares
 * `base.finalState`.
 */
function smallTsu(base: StateMachine): StateMachine {
  const first = base.initialState;
  const doubled = new State(first.display.charAt(0) + first.display);
  for (const key of Object.keys(first.transitions)) {
    const target = first.transitions[key][0];
    const bridge = new State(key + target.display);
    bridge.addTransition(key, target);
    doubled.addTransition(key, bridge, true);
  }
  return new StateMachine(doubled, base.finalState);
}
/**
 * Wraps `base` for its small-kana form. The new initial state is a clone of
 * the base initial state with two extra transitions, on `l` and on `x`, that
 * both lead to the original initial state. The result shares
 * `base.finalState`.
 */
function smallKana(base: StateMachine): StateMachine {
  const original = base.initialState;
  const entry = original.clone();
  for (const prefix of ['l', 'x']) {
    entry.addTransition(prefix, original);
  }
  return new StateMachine(entry, base.finalState);
}
/** Lookup table from a normalized kana segment to the machine that accepts it. */
type KanaMapping = Record<string, StateMachine>;
/** Plain string-to-string lookup table. */
type StringMapping = Record<string, string>;
/**
 * Machine for a blank in the source text. It is displayed as `_` and is
 * completed by typing either an underscore or a space.
 */
const WHITESPACE = state.buildFromTransitions(
  '_',
  ['_', ' '].map((key) => t('_', key, '')),
);
/**
 * Katakana to hiragana, one character to one character, so that the rest of
 * the module only has to know about hiragana. Used by `normalizeInput`.
 */
const KATAKANA_MAPPING: StringMapping = {
  // plain syllables
  ア: 'あ', イ: 'い', ウ: 'う', エ: 'え', オ: 'お',
  カ: 'か', キ: 'き', ク: 'く', ケ: 'け', コ: 'こ',
  サ: 'さ', シ: 'し', ス: 'す', セ: 'せ', ソ: 'そ',
  タ: 'た', チ: 'ち', ツ: 'つ', テ: 'て', ト: 'と',
  ナ: 'な', ニ: 'に', ヌ: 'ぬ', ネ: 'ね', ノ: 'の',
  ハ: 'は', ヒ: 'ひ', フ: 'ふ', ヘ: 'へ', ホ: 'ほ',
  マ: 'ま', ミ: 'み', ム: 'む', メ: 'め', モ: 'も',
  ヤ: 'や', ユ: 'ゆ', ヨ: 'よ',
  ラ: 'ら', リ: 'り', ル: 'る', レ: 'れ', ロ: 'ろ',
  ワ: 'わ', ヰ: 'ゐ', ヱ: 'ゑ', ヲ: 'を',
  ン: 'ん',
  // voiced and semi-voiced syllables
  ガ: 'が', ギ: 'ぎ', グ: 'ぐ', ゲ: 'げ', ゴ: 'ご',
  ザ: 'ざ', ジ: 'じ', ズ: 'ず', ゼ: 'ぜ', ゾ: 'ぞ',
  ダ: 'だ', ヂ: 'ぢ', ヅ: 'づ', デ: 'で', ド: 'ど',
  バ: 'ば', ビ: 'び', ブ: 'ぶ', ベ: 'べ', ボ: 'ぼ',
  パ: 'ぱ', ピ: 'ぴ', プ: 'ぷ', ペ: 'ぺ', ポ: 'ぽ',
  ヴ: 'ゔ',
  // small kana
  ァ: 'ぁ', ィ: 'ぃ', ゥ: 'ぅ', ェ: 'ぇ', ォ: 'ぉ',
  ャ: 'ゃ', ュ: 'ゅ', ョ: 'ょ',
  ッ: 'っ',
};
/**
 * Machines for segments that are one character long after normalization:
 * hiragana, the long-vowel mark and the blank. Kana with more than one
 * accepted spelling use a dedicated builder; everything else is a literal.
 */
const SINGLE_KANA_MAPPING: KanaMapping = {
  // plain syllables
  あ: literal('a'), い: literal('i'), う: literal('u'), え: literal('e'), お: literal('o'),
  か: literal('ka'), き: literal('ki'), く: literal('ku'), け: literal('ke'), こ: literal('ko'),
  さ: literal('sa'), し: shi(), す: literal('su'), せ: literal('se'), そ: literal('so'),
  た: literal('ta'), ち: chi(), つ: tsu(), て: literal('te'), と: literal('to'),
  な: literal('na'), に: literal('ni'), ぬ: literal('nu'), ね: literal('ne'), の: literal('no'),
  は: literal('ha'), ひ: literal('hi'), ふ: fu(), へ: literal('he'), ほ: literal('ho'),
  ま: literal('ma'), み: literal('mi'), む: literal('mu'), め: literal('me'), も: literal('mo'),
  や: literal('ya'), ゆ: literal('yu'), よ: literal('yo'),
  ら: literal('ra'), り: literal('ri'), る: literal('ru'), れ: literal('re'), ろ: literal('ro'),
  // ゐ and ゑ are typed as plain "i" and "e"
  わ: literal('wa'), ゐ: literal('i'), ゑ: literal('e'), を: literal('wo'),
  ん: literal('n'),
  // voiced and semi-voiced syllables
  が: literal('ga'), ぎ: literal('gi'), ぐ: literal('gu'), げ: literal('ge'), ご: literal('go'),
  ざ: literal('za'), じ: ji(), ず: literal('zu'), ぜ: literal('ze'), ぞ: literal('zo'),
  だ: literal('da'), ぢ: literal('di'), づ: literal('du'), で: literal('de'), ど: literal('do'),
  ば: literal('ba'), び: literal('bi'), ぶ: literal('bu'), べ: literal('be'), ぼ: literal('bo'),
  ぱ: literal('pa'), ぴ: literal('pi'), ぷ: literal('pu'), ぺ: literal('pe'), ぽ: literal('po'),
  ゔ: literal('vu'),
  // punctuation-like entries
  ー: literal('-'),
  ' ': WHITESPACE,
};
// Latin letters in the source text are typed as themselves.
for (const letter of 'abcdefghijklmnopqrstuvwxyz'.split('')) {
  SINGLE_KANA_MAPPING[letter] = literal(letter);
}

// Small kana reuse the machine of their full-size counterpart, wrapped by
// smallKana().
for (const [small, big] of [
  ['ぁ', 'あ'],
  ['ぃ', 'い'],
  ['ぅ', 'う'],
  ['ぇ', 'え'],
  ['ぉ', 'お'],
  ['ヵ', 'か'],
]) {
  SINGLE_KANA_MAPPING[small] = smallKana(SINGLE_KANA_MAPPING[big]);
}
/**
 * Machines for two-character segments (a kana followed by a small kana).
 * Literal spellings pass index 0 as an extra boundary, so the transition on
 * the first typed character is flagged as a boundary as well as the last.
 * The し/ち/じ families have alternative spellings and use their own builders.
 */
const DOUBLE_KANA_MAPPING: KanaMapping = {
  きゃ: literal('kya', 0), きゅ: literal('kyu', 0), きょ: literal('kyo', 0),
  しゃ: sh('a'), しゅ: sh('u'), しょ: sh('o'),
  ちゃ: ch('a'), ちゅ: ch('u'), ちょ: ch('o'),
  にゃ: literal('nya', 0), にゅ: literal('nyu', 0), にょ: literal('nyo', 0),
  ひゃ: literal('hya', 0), ひゅ: literal('hyu', 0), ひょ: literal('hyo', 0),
  みゃ: literal('mya', 0), みゅ: literal('myu', 0), みょ: literal('myo', 0),
  りゃ: literal('rya', 0), りゅ: literal('ryu', 0), りょ: literal('ryo', 0),
  ぎゃ: literal('gya', 0), ぎゅ: literal('gyu', 0), ぎょ: literal('gyo', 0),
  じゃ: j('a'), じゅ: j('u'), じょ: j('o'),
  ぢゃ: literal('dya', 0), ぢゅ: literal('dyu', 0), ぢょ: literal('dyo', 0),
  びゃ: literal('bya', 0), びゅ: literal('byu', 0), びょ: literal('byo', 0),
  ぴゃ: literal('pya', 0), ぴゅ: literal('pyu', 0), ぴょ: literal('pyo', 0),
  ふぁ: literal('fa', 0), ふぃ: literal('fi', 0), ふぇ: literal('fe', 0), ふぉ: literal('fo', 0),
  ゔぁ: literal('va', 0), ゔぃ: literal('vi', 0), ゔぇ: literal('ve', 0), ゔぉ: literal('vo', 0),
};
/**
 * Machines for three-character segments: っ followed by a two-character
 * kana. Filled in below.
 */
const TRIPLE_KANA_MAPPING: KanaMapping = {};

// っ + single kana goes into the two-character table. Only rows whose
// consonant is listed here get a doubled-consonant machine.
for (const kana of [
  'か', 'き', 'く', 'け', 'こ',
  'さ', 'し', 'す', 'せ', 'そ',
  'た', 'ち', 'つ', 'て', 'と',
  'は', 'ひ', 'ふ', 'へ', 'ほ',
  'が', 'ぎ', 'ぐ', 'げ', 'ご',
  'ざ', 'じ', 'ず', 'ぜ', 'ぞ',
  'だ', 'ぢ', 'づ', 'で', 'ど',
  'ば', 'び', 'ぶ', 'べ', 'ぼ',
  'ぱ', 'ぴ', 'ぷ', 'ぺ', 'ぽ',
  'ゔ',
]) {
  DOUBLE_KANA_MAPPING['っ' + kana] = smallTsu(SINGLE_KANA_MAPPING[kana]);
}

// っ + two-character kana goes into the three-character table. This must run
// after DOUBLE_KANA_MAPPING holds the plain digraphs it wraps.
//
// The ひゃ row is included so that it matches the single-kana list above,
// which already geminates ひ. Without it "っひゃ" would be split into "っひ"
// followed by an unmapped (and therefore ignored) "ゃ".
for (const kana of [
  'きゃ', 'きゅ', 'きょ',
  'しゃ', 'しゅ', 'しょ',
  'ちゃ', 'ちゅ', 'ちょ',
  'ひゃ', 'ひゅ', 'ひょ',
  'ぎゃ', 'ぎゅ', 'ぎょ',
  'じゃ', 'じゅ', 'じょ',
  'ぢゃ', 'ぢゅ', 'ぢょ',
  'びゃ', 'びゅ', 'びょ',
  'ぴゃ', 'ぴゅ', 'ぴょ',
  'ふぁ', 'ふぃ', 'ふぇ', 'ふぉ',
  'ゔぁ', 'ゔぃ', 'ゔぇ', 'ゔぉ',
]) {
  TRIPLE_KANA_MAPPING['っ' + kana] = smallTsu(DOUBLE_KANA_MAPPING[kana]);
}
/**
 * Normalizes source text for matching: letters are lower-cased, katakana is
 * turned into hiragana, and every whitespace character becomes a plain space.
 *
 * The result always has exactly as many UTF-16 code units as `input`, because
 * callers index both strings in parallel to recover the original characters
 * for display.
 *
 * @param input the raw source text
 * @returns the normalized text, the same length as `input`
 */
export function normalizeInput(input: string): string {
  const whitespace = /\s/;

  // Lower-casing a whole string can lengthen it (for example U+0130 becomes
  // "i" plus a combining dot). When that happens, lower-case one code unit at
  // a time and keep any unit whose lower-case form is not a single unit.
  const wholeLowered = input.toLowerCase();
  const lowered =
    wholeLowered.length === input.length
      ? wholeLowered
      : input
          .split('')
          .map((unit) => {
            const lower = unit.toLowerCase();
            return lower.length === 1 ? lower : unit;
          })
          .join('');

  return lowered
    .split('')
    .map((letter) => {
      const hiragana = KATAKANA_MAPPING[letter];
      if (hiragana !== undefined) {
        return hiragana;
      }
      return whitespace.test(letter) ? ' ' : letter;
    })
    .join('');
}
/**
 * Tracks typing progress through one piece of source text.
 *
 * The text is split into segments of up to three characters, each paired with
 * its own clone of the matching state machine. Keystrokes are fed to the
 * machine at `currentIndex`, which advances as machines finish.
 */
export class KanaInputState {
  /** Original (un-normalized) text of each recognized segment. */
  kana: string[];
  /** One machine per entry of `kana`, at the same index. */
  stateMachines: StateMachine[];
  /** Index of the machine that receives the next keystroke. */
  currentIndex: number;

  /**
   * Segments `input` greedily, trying the longest segment first. Characters
   * with no mapping are skipped and produce no segment.
   */
  constructor(input: string) {
    // index n - 1 holds the table for segments of length n
    const tablesByLength = [
      SINGLE_KANA_MAPPING,
      DOUBLE_KANA_MAPPING,
      TRIPLE_KANA_MAPPING,
    ];
    // we pad the input so checking 3 at a time is simpler
    const normalized = normalizeInput(input) + ' ';
    const segments: string[] = [];
    const machines: StateMachine[] = [];

    let position = 0;
    while (position < input.length) {
      // an unmapped character is stepped over on its own
      let consumed = 1;
      for (let length = tablesByLength.length; length >= 1; --length) {
        const end = position + length;
        const template = tablesByLength[length - 1][normalized.slice(position, end)];
        if (template == null) {
          continue;
        }
        // each segment gets its own copy so progress is tracked independently
        const machine = template.clone();
        if (machines.length > 0) {
          machines[machines.length - 1].nextMachine = machine;
        }
        segments.push(input.slice(position, end));
        machines.push(machine);
        consumed = length;
        break;
      }
      position += consumed;
    }

    this.kana = segments;
    this.stateMachines = machines;
    this.currentIndex = 0;
  }

  /** Calls `func` with every segment and its machine, collecting the results. */
  map<T>(func: (s: string, m: StateMachine) => T): T[] {
    return this.kana.map((segment, index) =>
      func(segment, this.stateMachines[index]),
    );
  }

  /**
   * Feeds one keystroke to the current machine and moves past every machine
   * that is finished afterwards.
   *
   * @returns `true` when this keystroke completed the last machine; `false`
   *   otherwise, including when everything was already complete
   */
  handleInput(input: string): boolean {
    if (this.currentIndex >= this.stateMachines.length) {
      return false;
    }
    let machine = this.stateMachines[this.currentIndex];
    machine.transition(input);
    while (machine.isFinished()) {
      this.currentIndex += 1;
      machine = this.stateMachines[this.currentIndex];
      if (machine == null) {
        return true;
      }
    }
    return this.currentIndex >= this.stateMachines.length;
  }

  /** Concatenates the display text of the current and all later machines. */
  getRemainingInput(): string {
    return this.stateMachines
      .slice(this.currentIndex)
      .reduce((text, machine) => text + machine.getDisplay(), '');
  }
}
|