/**
 * This module handles romaji input and matches it against the provided kana
 * input. While most kana map one-to-one with romaji, some kana can be typed in
 * more than one way. In addition, we also have to handle っ, which causes the
 * next consonant to be repeated.
 *
 * The state management is done by having a state machine for each kana; each
 * machine should handle all accepted romaji variations for that kana.
 * Additionally, it keeps track of what is left to be typed, and adjusts
 * itself accordingly if an alternative romaji was used.
 *
 * One of the key considerations is handling っ. It doesn't have a spelling in
 * and of itself, but just modifies the state machine that comes after it.
 * Intermediate states need to be created, and care should be taken with what
 * shows up in the display.
 */

// NOTE(review): the triple-slash line below carries no directive payload in
// this copy of the file. If the `state` namespace is declared in another file,
// a reference directive (or equivalent build configuration) is presumably
// needed here - confirm against the project layout.
///

namespace kana {
  import State = state.State;
  import StateMachine = state.StateMachine;
  import TransitionResult = state.TransitionResult;
  import t = state.makeTransition;

  /**
   * Builds a machine that accepts exactly the characters of `source`, in order.
   *
   * Each state is named after the romaji that is still left to type. The
   * transition for the last character is always created with the boundary flag
   * set; `extraBoundaries` lists additional character indices whose transition
   * should also carry the flag. The precise effect of the flag is defined by
   * the `state` module.
   */
  function literal(source: string, ...extraBoundaries: number[]): StateMachine {
    let transitions: state.Transition[] = [];
    for (let i = 0; i < source.length; ++i) {
      let from = source.substring(i);
      let input = source.charAt(i);
      let to = source.substring(i + 1);
      let boundary = i === (source.length - 1) || extraBoundaries.indexOf(i) >= 0;
      transitions.push(t(from, input, to, boundary));
    }
    return state.buildFromTransitions(source, transitions);
  }

  /** し: accepts "shi" and the shorter "si". */
  function shi(): StateMachine {
    return state.buildFromTransitions('shi', [
      t('shi', 's', 'hi'),
      t('hi', 'h', 'i'),
      t('hi', 'i', '', true),
      t('i', 'i', '', true)
    ]);
  }

  /** ち: accepts "chi" and "ti". */
  function chi(): StateMachine {
    return state.buildFromTransitions('chi', [
      t('chi', 'c', 'hi'),
      t('chi', 't', 'i'),
      t('hi', 'h', 'i'),
      t('i', 'i', '', true)
    ]);
  }

  /** つ: accepts "tsu" and the shorter "tu". */
  function tsu(): StateMachine {
    return state.buildFromTransitions('tsu', [
      t('tsu', 't', 'su'),
      t('su', 's', 'u'),
      t('su', 'u', '', true),
      t('u', 'u', '', true)
    ]);
  }

  /** ふ: accepts "fu" and "hu". */
  function fu(): StateMachine {
    return state.buildFromTransitions('fu', [
      t('fu', 'f', 'u'),
      t('fu', 'h', 'u'),
      t('u', 'u', '', true)
    ]);
  }

  /** じ: accepts "ji" and "zi". */
  function ji(): StateMachine {
    return state.buildFromTransitions('ji', [
      t('ji', 'j', 'i'),
      t('ji', 'z', 'i'),
      t('i', 'i', '', true)
    ]);
  }

  /**
   * しゃ/しゅ/しょ: accepts "sh" + `end` and "sy" + `end`, where `end` is the
   * final vowel.
   */
  function sh(end: string): StateMachine {
    let source = 'sh' + end;
    let middle = 'h' + end;
    return state.buildFromTransitions(source, [
      t(source, 's', middle, true),
      t(middle, 'h', end),
      t(middle, 'y', end),
      t(end, end, '', true)
    ]);
  }

  /**
   * ちゃ/ちゅ/ちょ: accepts "ch" + `end` and "ty" + `end`, where `end` is the
   * final vowel.
   */
  function ch(end: string): StateMachine {
    let source = 'ch' + end;
    let middle = 'h' + end;
    let altMiddle = 'y' + end;
    return state.buildFromTransitions(source, [
      t(source, 'c', middle),
      t(middle, 'h', end, true),
      t(source, 't', altMiddle, true),
      t(altMiddle, 'y', end),
      t(end, end, '', true)
    ]);
  }

  /**
   * じゃ/じゅ/じょ: accepts "j" + `end` and "zy" + `end`. The `end` state also
   * has a 'y' transition back to itself, so "jy" + `end` is accepted as well.
   */
  function j(end: string): StateMachine {
    let source = 'j' + end;
    let altMiddle = 'y' + end;
    return state.buildFromTransitions(source, [
      t(source, 'j', end, true),
      t(source, 'z', altMiddle),
      t(end, 'y', end),
      t(altMiddle, 'y', end, true),
      t(end, end, '', true)
    ]);
  }

  /**
   * Builds the machine for っ followed by the kana handled by `base`.
   *
   * The new initial state displays `base`'s initial display with its first
   * character doubled. For every key `k` that `base`'s initial state accepts,
   * typing `k` moves to an intermediate state displaying `k` plus the display
   * of the state `base` would have moved to; typing `k` a second time then
   * moves to that state. The first of the two transitions is added with `true`
   * as its third argument (presumably the boundary flag - confirm against the
   * `state` module).
   */
  function smallTsu(base: StateMachine): StateMachine {
    let { display, transitions } = base.initialState;

    let newState = new State(display.charAt(0) + display);
    Object.keys(transitions).forEach(k => {
      // only the target state is needed; the rest of the entry is ignored
      let [nextState] = transitions[k];
      let intermediateDisplay = k + nextState.display;
      let intermediateState = new State(intermediateDisplay);
      intermediateState.addTransition(k, nextState);
      newState.addTransition(k, intermediateState, true);
    });

    return new StateMachine(newState, base.finalState);
  }

  /**
   * Builds the machine for a small kana from the machine of its full-size
   * counterpart: the cloned initial state additionally accepts an 'l' or 'x'
   * prefix, both of which lead to `base`'s own initial state.
   */
  function smallKana(base: StateMachine): StateMachine {
    let newState = base.initialState.clone();
    newState.addTransition('l', base.initialState);
    newState.addTransition('x', base.initialState);
    return new StateMachine(newState, base.finalState);
  }

  interface KanaMapping {
    [index: string]: StateMachine
  }

  interface StringMapping {
    [index: string]: string
  }

  /** Machine for whitespace: displayed as '_', accepts '_' or a space. */
  const WHITESPACE = state.buildFromTransitions('_', [
    t('_', '_', ''),
    t('_', ' ', '')
  ]);

  /** Matches a single whitespace character; hoisted so it is built once. */
  const WHITESPACE_CHAR = /\s/;

  /** Katakana to hiragana, one character to one character. */
  const KATAKANA_MAPPING: StringMapping = {
    "ア": "あ", "イ": "い", "ウ": "う", "エ": "え", "オ": "お",
    "カ": "か", "キ": "き", "ク": "く", "ケ": "け", "コ": "こ",
    "サ": "さ", "シ": "し", "ス": "す", "セ": "せ", "ソ": "そ",
    "タ": "た", "チ": "ち", "ツ": "つ", "テ": "て", "ト": "と",
    "ナ": "な", "ニ": "に", "ヌ": "ぬ", "ネ": "ね", "ノ": "の",
    "ハ": "は", "ヒ": "ひ", "フ": "ふ", "ヘ": "へ", "ホ": "ほ",
    "マ": "ま", "ミ": "み", "ム": "む", "メ": "め", "モ": "も",
    "ヤ": "や", "ユ": "ゆ", "ヨ": "よ",
    "ラ": "ら", "リ": "り", "ル": "る", "レ": "れ", "ロ": "ろ",
    "ワ": "わ", "ヰ": "ゐ", "ヱ": "ゑ", "ヲ": "を", "ン": "ん",
    "ガ": "が", "ギ": "ぎ", "グ": "ぐ", "ゲ": "げ", "ゴ": "ご",
    "ザ": "ざ", "ジ": "じ", "ズ": "ず", "ゼ": "ぜ", "ゾ": "ぞ",
    "ダ": "だ", "ヂ": "ぢ", "ヅ": "づ", "デ": "で", "ド": "ど",
    "バ": "ば", "ビ": "び", "ブ": "ぶ", "ベ": "べ", "ボ": "ぼ",
    "パ": "ぱ", "ピ": "ぴ", "プ": "ぷ", "ペ": "ぺ", "ポ": "ぽ",
    "ヴ": "ゔ",
    "ァ": "ぁ", "ィ": "ぃ", "ゥ": "ぅ", "ェ": "ぇ", "ォ": "ぉ",
    "ャ": "ゃ", "ュ": "ゅ", "ョ": "ょ",
    "ッ": "っ"
  };

  /** Machines for segments that are one character long after normalization. */
  const SINGLE_KANA_MAPPING: KanaMapping = {
    "あ": literal('a'), "い": literal('i'), "う": literal('u'), "え": literal('e'), "お": literal('o'),
    "か": literal('ka'), "き": literal('ki'), "く": literal('ku'), "け": literal('ke'), "こ": literal('ko'),
    "さ": literal('sa'), "し": shi(), "す": literal('su'), "せ": literal('se'), "そ": literal('so'),
    "た": literal('ta'), "ち": chi(), "つ": tsu(), "て": literal('te'), "と": literal('to'),
    "な": literal('na'), "に": literal('ni'), "ぬ": literal('nu'), "ね": literal('ne'), "の": literal('no'),
    "は": literal('ha'), "ひ": literal('hi'), "ふ": fu(), "へ": literal('he'), "ほ": literal('ho'),
    "ま": literal('ma'), "み": literal('mi'), "む": literal('mu'), "め": literal('me'), "も": literal('mo'),
    "や": literal('ya'), "ゆ": literal('yu'), "よ": literal('yo'),
    "ら": literal('ra'), "り": literal('ri'), "る": literal('ru'), "れ": literal('re'), "ろ": literal('ro'),
    "わ": literal('wa'), "ゐ": literal('i'), "ゑ": literal('e'), "を": literal('wo'), "ん": literal('n'),
    "が": literal('ga'), "ぎ": literal('gi'), "ぐ": literal('gu'), "げ": literal('ge'), "ご": literal('go'),
    "ざ": literal('za'), "じ": ji(), "ず": literal('zu'), "ぜ": literal('ze'), "ぞ": literal('zo'),
    "だ": literal('da'), "ぢ": literal('di'), "づ": literal('du'), "で": literal('de'), "ど": literal('do'),
    "ば": literal('ba'), "び": literal('bi'), "ぶ": literal('bu'), "べ": literal('be'), "ぼ": literal('bo'),
    "ぱ": literal('pa'), "ぴ": literal('pi'), "ぷ": literal('pu'), "ぺ": literal('pe'), "ぽ": literal('po'),
    "ゔ": literal('vu'),
    "ー": literal('-'),
    " ": WHITESPACE
  };

  // plain latin letters are typed as themselves
  'abcdefghijklmnopqrstuvwxyz'.split('').forEach(letter => {
    SINGLE_KANA_MAPPING[letter] = literal(letter);
  });

  // small kana reuse the machine of their full-size counterpart
  [
    ['ぁ', 'あ'],
    ['ぃ', 'い'],
    ['ぅ', 'う'],
    ['ぇ', 'え'],
    ['ぉ', 'お'],
    ['ヵ', 'か']
  ].forEach(pair => {
    let [ small, big ] = pair;
    SINGLE_KANA_MAPPING[small] = smallKana(SINGLE_KANA_MAPPING[big]);
  });

  /** Machines for segments that are two characters long. */
  const DOUBLE_KANA_MAPPING: KanaMapping = {
    "きゃ": literal('kya', 0), "きゅ": literal('kyu', 0), "きょ": literal('kyo', 0),
    "しゃ": sh('a'), "しゅ": sh('u'), "しょ": sh('o'),
    "ちゃ": ch('a'), "ちゅ": ch('u'), "ちょ": ch('o'),
    "にゃ": literal('nya', 0), "にゅ": literal('nyu', 0), "にょ": literal('nyo', 0),
    "ひゃ": literal('hya', 0), "ひゅ": literal('hyu', 0), "ひょ": literal('hyo', 0),
    "みゃ": literal('mya', 0), "みゅ": literal('myu', 0), "みょ": literal('myo', 0),
    "りゃ": literal('rya', 0), "りゅ": literal('ryu', 0), "りょ": literal('ryo', 0),
    "ぎゃ": literal('gya', 0), "ぎゅ": literal('gyu', 0), "ぎょ": literal('gyo', 0),
    "じゃ": j('a'), "じゅ": j('u'), "じょ": j('o'),
    "ぢゃ": literal('dya', 0), "ぢゅ": literal('dyu', 0), "ぢょ": literal('dyo', 0),
    "びゃ": literal('bya', 0), "びゅ": literal('byu', 0), "びょ": literal('byo', 0),
    "ぴゃ": literal('pya', 0), "ぴゅ": literal('pyu', 0), "ぴょ": literal('pyo', 0),
    "ふぁ": literal('fa', 0), "ふぃ": literal('fi', 0), "ふぇ": literal('fe', 0), "ふぉ": literal('fo', 0),
    "ゔぁ": literal('va', 0), "ゔぃ": literal('vi', 0), "ゔぇ": literal('ve', 0), "ゔぉ": literal('vo', 0)
  };

  /** Machines for segments that are three characters long (っ + two kana). */
  const TRIPLE_KANA_MAPPING: KanaMapping = {};

  // っ followed by a single kana
  [
    "か", "き", "く", "け", "こ",
    "さ", "し", "す", "せ", "そ",
    "た", "ち", "つ", "て", "と",
    "は", "ひ", "ふ", "へ", "ほ",
    "が", "ぎ", "ぐ", "げ", "ご",
    "ざ", "じ", "ず", "ぜ", "ぞ",
    "だ", "ぢ", "づ", "で", "ど",
    "ば", "び", "ぶ", "べ", "ぼ",
    "ぱ", "ぴ", "ぷ", "ぺ", "ぽ",
    "ゔ"
  ].forEach(kana => {
    DOUBLE_KANA_MAPPING['っ' + kana] = smallTsu(SINGLE_KANA_MAPPING[kana]);
  });

  // っ followed by a two-character kana
  [
    "きゃ", "きゅ", "きょ",
    "しゃ", "しゅ", "しょ",
    "ちゃ", "ちゅ", "ちょ",
    "ぎゃ", "ぎゅ", "ぎょ",
    "じゃ", "じゅ", "じょ",
    "ぢゃ", "ぢゅ", "ぢょ",
    "びゃ", "びゅ", "びょ",
    "ぴゃ", "ぴゅ", "ぴょ",
    "ふぁ", "ふぃ", "ふぇ", "ふぉ",
    "ゔぁ", "ゔぃ", "ゔぇ", "ゔぉ"
  ].forEach(kana => {
    TRIPLE_KANA_MAPPING['っ' + kana] = smallTsu(DOUBLE_KANA_MAPPING[kana]);
  });

  /**
   * This normalizes input for matching. All alphabet is lower-cased, katakana
   * is transformed to hiragana. All whitespace is now just a space. We take
   * care to not change the length of the string as we have to match it
   * one-for-one so we can display the original source kana.
   *
   * Lower-casing is done one UTF-16 code unit at a time, and a unit is left
   * untouched if its lower-case form has a different length: lower-casing the
   * whole string up front can change its length for some characters (for
   * example U+0130), which would break the one-for-one correspondence.
   */
  function normalizeInput(input: string): string {
    return input.split('').map(letter => {
      let transform = KATAKANA_MAPPING[letter];
      if (transform !== undefined) {
        return transform;
      } else if (WHITESPACE_CHAR.test(letter)) {
        return ' ';
      } else {
        let lowered = letter.toLowerCase();
        return lowered.length === letter.length ? lowered : letter;
      }
    }).join('');
  }

  /**
   * Tracks typing progress through a kana string.
   *
   * The input is split into segments of one to three characters, each paired
   * with a clone of the state machine that accepts its romaji. `kana[i]` is
   * the original (un-normalized) text of segment `i` and `stateMachines[i]` is
   * its machine; `currentIndex` is the index of the machine that receives the
   * next typed character.
   */
  export class KanaInputState {
    kana: string[];
    stateMachines: StateMachine[];
    currentIndex: number;

    /**
     * @param input the text to be typed; characters with no mapping are
     *     skipped and do not appear in `kana`
     */
    constructor(input: string) {
      let kana: string[] = [];
      let machines: StateMachine[] = [];
      let position = 0;

      let mappings = [
        SINGLE_KANA_MAPPING,
        DOUBLE_KANA_MAPPING,
        TRIPLE_KANA_MAPPING
      ];

      // we pad the input so checking 3 at a time is simpler
      let normalized = normalizeInput(input) + '  ';
      while (position < input.length) {
        // we check substrings of length 3, 2, then 1
        for (let i = 3; i > 0; --i) {
          // lookups use the normalized text, but the original text is what
          // gets stored so the source kana can be displayed
          let original = input.substring(position, position + i);
          let segment = normalized.substring(position, position + i);
          let machine = mappings[i - 1][segment];
          if (machine !== undefined) {
            kana.push(original);
            let nextMachine = machine.clone();
            if (machines.length > 0) {
              // chain the machines so each one knows its successor
              let prevMachine = machines[machines.length - 1];
              prevMachine.nextMachine = nextMachine;
            }
            machines.push(nextMachine);
            // the remaining +1 is added below
            position += i - 1;
            break;
          }
        }
        // even if we don't find a match, keep progressing
        // unmapped characters will be ignored
        position += 1;
      }

      this.kana = kana;
      this.stateMachines = machines;
      this.currentIndex = 0;
    }

    /**
     * Calls `func` with every (original segment, machine) pair, in order.
     *
     * @returns the results of `func`, one per segment
     */
    map<T>(func: (s: string, m: StateMachine) => T): T[] {
      let result: T[] = [];
      for (let i = 0; i < this.kana.length; ++i) {
        result.push(func(this.kana[i], this.stateMachines[i]));
      }
      return result;
    }

    /**
     * Feeds one typed character to the current machine, then advances
     * `currentIndex` past every machine that reports itself finished.
     *
     * @returns true if this call moved past the last machine; false
     *     otherwise, including when every machine had already been finished
     *     before the call
     */
    handleInput(input: string): boolean {
      if (this.currentIndex >= this.stateMachines.length) {
        return false;
      }

      let currentMachine = this.stateMachines[this.currentIndex];
      currentMachine.transition(input);
      while (currentMachine.isFinished()) {
        this.currentIndex += 1;
        currentMachine = this.stateMachines[this.currentIndex];
        if (currentMachine == null) {
          // ran off the end: everything has been typed
          return true;
        }
      }
      return this.currentIndex >= this.stateMachines.length;
    }

    /**
     * @returns the concatenated display of the current machine and every
     *     machine after it
     */
    getRemainingInput(): string {
      let remaining = '';
      for (let i = this.currentIndex; i < this.stateMachines.length; ++i) {
        remaining += this.stateMachines[i].getDisplay();
      }
      return remaining;
    }
  }
}