123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510 |
- /**
- * This module is mainly for handling romaji input to match the provided kana
- * input. While most kana map one-to-one with romaji, some kana have multiple
- * ways to be inputted. In addition, we also have to handle っ which causes the
- * next consonant to be repeated.
- *
- * The state management is done by having a state machine for each kana and it
- * should handle all possible variations of the romaji to be inputted.
- * Additionally, it also keeps track of what is left to be input, and adjusts
- * itself accordingly if an alternative romaji was used.
- *
- * One of the key considerations is handling っ. It doesn't have a spelling in
- * and of itself, but just modifies the state machine that will come after it.
- * Intermediate states need to be created and care should be given in what shows
- * up in the display.
- */
- /// <reference path="state.ts" />
- namespace kana {
- import State = state.State;
- import StateMachine = state.StateMachine;
- import TransitionResult = state.TransitionResult;
- import t = state.makeTransition;
/**
 * Builds a machine that accepts exactly the characters of `source`, in order.
 *
 * Every state is named after the text still left to type: the transition for
 * the character at index `i` goes from `source.substring(i)` to
 * `source.substring(i + 1)`.
 *
 * @param source romaji spelling to accept
 * @param extraBoundaries indices of characters whose transition is flagged as
 *   a boundary, in addition to the last character which is always flagged
 * @returns the machine produced by `state.buildFromTransitions`
 */
function literal(source: string, ...extraBoundaries: number[]): StateMachine {
  const lastIndex = source.length - 1;
  const transitions: state.Transition[] = source.split('').map((input, index) => {
    const isBoundary = index === lastIndex || extraBoundaries.indexOf(index) !== -1;
    return t(source.substring(index), input, source.substring(index + 1), isBoundary);
  });
  return state.buildFromTransitions(source, transitions);
}
/**
 * Machine for し: starts at 'shi' and accepts either "shi" or "si".
 */
function shi(): StateMachine {
  const transitions: state.Transition[] = [
    t('shi', 's', 'hi'),
    // after the 's', the 'h' is optional
    t('hi', 'h', 'i'),
    t('hi', 'i', '', true),
    t('i', 'i', '', true)
  ];
  return state.buildFromTransitions('shi', transitions);
}
/**
 * Machine for ち: starts at 'chi' and accepts either "chi" or "ti".
 */
function chi(): StateMachine {
  const transitions: state.Transition[] = [
    t('chi', 'c', 'hi'),
    // alternative spelling: 't' skips straight to the vowel
    t('chi', 't', 'i'),
    t('hi', 'h', 'i'),
    t('i', 'i', '', true)
  ];
  return state.buildFromTransitions('chi', transitions);
}
/**
 * Machine for つ: starts at 'tsu' and accepts either "tsu" or "tu".
 */
function tsu(): StateMachine {
  const transitions: state.Transition[] = [
    t('tsu', 't', 'su'),
    // after the 't', the 's' is optional
    t('su', 's', 'u'),
    t('su', 'u', '', true),
    t('u', 'u', '', true)
  ];
  return state.buildFromTransitions('tsu', transitions);
}
/**
 * Machine for ふ: starts at 'fu' and accepts either "fu" or "hu".
 */
function fu(): StateMachine {
  const transitions: state.Transition[] = [
    t('fu', 'f', 'u'),
    t('fu', 'h', 'u'),
    t('u', 'u', '', true)
  ];
  return state.buildFromTransitions('fu', transitions);
}
/**
 * Machine for じ: starts at 'ji' and accepts either "ji" or "zi".
 */
function ji(): StateMachine {
  const transitions: state.Transition[] = [
    t('ji', 'j', 'i'),
    t('ji', 'z', 'i'),
    t('i', 'i', '', true)
  ];
  return state.buildFromTransitions('ji', transitions);
}
/**
 * Machine for the しゃ/しゅ/しょ family.
 *
 * Starts at `'sh' + end` and accepts `'sh' + end` or `'sy' + end`. The
 * transition for the leading 's' and the final transition are flagged as
 * boundaries.
 *
 * @param end the trailing vowel; it is used both as a state name and as the
 *   input of the last transition
 */
function sh(end: string): StateMachine {
  const start = `sh${end}`;
  const afterS = `h${end}`;
  const transitions: state.Transition[] = [
    t(start, 's', afterS, true),
    // 'h' and 'y' are interchangeable after the 's'
    t(afterS, 'h', end),
    t(afterS, 'y', end),
    t(end, end, '', true)
  ];
  return state.buildFromTransitions(start, transitions);
}
/**
 * Machine for the ちゃ/ちゅ/ちょ family.
 *
 * Starts at `'ch' + end` and accepts `'ch' + end` or `'ty' + end`. On the
 * "ch" path the boundary flag is on the 'h' transition; on the "ty" path it
 * is on the 't' transition. The final transition is flagged in both cases.
 *
 * @param end the trailing vowel; it is used both as a state name and as the
 *   input of the last transition
 */
function ch(end: string): StateMachine {
  const start = `ch${end}`;
  const afterC = `h${end}`;
  const afterT = `y${end}`;
  const transitions: state.Transition[] = [
    // "ch" spelling
    t(start, 'c', afterC),
    t(afterC, 'h', end, true),
    // "ty" spelling
    t(start, 't', afterT, true),
    t(afterT, 'y', end),
    t(end, end, '', true)
  ];
  return state.buildFromTransitions(start, transitions);
}
/**
 * Machine for the じゃ/じゅ/じょ family.
 *
 * Starts at `'j' + end` and accepts `'j' + end` or `'zy' + end`. The `end`
 * state additionally has a 'y' transition that leads back to `end`, which is
 * what lets `'jy' + end` through.
 *
 * NOTE(review): because that 'y' transition targets the same state name, it
 * looks like the 'y' could be typed repeatedly; confirm against the state
 * module whether that is intended.
 *
 * @param end the trailing vowel; it is used both as a state name and as the
 *   input of the last transition
 */
function j(end: string): StateMachine {
  const start = `j${end}`;
  const afterZ = `y${end}`;
  const transitions: state.Transition[] = [
    t(start, 'j', end, true),
    t(start, 'z', afterZ),
    // optional 'y' after the 'j'
    t(end, 'y', end),
    t(afterZ, 'y', end, true),
    t(end, end, '', true)
  ];
  return state.buildFromTransitions(start, transitions);
}
/**
 * Wraps `base` so that its first input has to be typed twice (っ + kana).
 *
 * The new initial state displays the base display with its first character
 * doubled. For every input the base's initial state accepts, an intermediate
 * state is created that displays that input followed by the display of the
 * state the base would have moved to. Typing the input once reaches the
 * intermediate state (flagged as a boundary); typing it again reaches the
 * base's original next state.
 *
 * The second element of each base transition entry is not carried over to the
 * rebuilt transition. The base's later states and its final state are reused,
 * not copied.
 *
 * @param base machine for the kana that follows the っ
 * @returns a new machine ending in `base.finalState`
 */
function smallTsu(base: StateMachine): StateMachine {
  const initial = base.initialState;
  const edges = initial.transitions;
  const doubled = new State(initial.display.charAt(0) + initial.display);
  for (const key of Object.keys(edges)) {
    const target = edges[key][0];
    const bridge = new State(key + target.display);
    bridge.addTransition(key, target);
    doubled.addTransition(key, bridge, true);
  }
  return new StateMachine(doubled, base.finalState);
}
/**
 * Wraps `base` so it can optionally be prefixed with 'l' or 'x'.
 *
 * The new initial state is a clone of the base's initial state with two extra
 * transitions, 'l' and 'x', that both lead to the base's own initial state.
 *
 * @param base machine for the full-size kana
 * @returns a new machine ending in `base.finalState`
 */
function smallKana(base: StateMachine): StateMachine {
  const entry = base.initialState.clone();
  for (const prefix of ['l', 'x']) {
    entry.addTransition(prefix, base.initialState);
  }
  return new StateMachine(entry, base.finalState);
}
/** Lookup table from a (normalized) kana segment to the machine that accepts its romaji. */
interface KanaMapping {
  [index: string]: StateMachine;
}

/** Lookup table from one string to another. */
interface StringMapping {
  [index: string]: string;
}
/**
 * Machine for a whitespace segment: starting at '_', either an underscore or
 * a space leads to the empty state.
 */
const WHITESPACE = state.buildFromTransitions(
  '_',
  ['_', ' '].map(key => t('_', key, ''))
);
/**
 * Katakana to hiragana lookup. Every key and every value is a single
 * character, so substituting one for the other keeps string length unchanged.
 */
const KATAKANA_MAPPING: StringMapping = {
  // basic syllables
  'ア': 'あ', 'イ': 'い', 'ウ': 'う', 'エ': 'え', 'オ': 'お',
  'カ': 'か', 'キ': 'き', 'ク': 'く', 'ケ': 'け', 'コ': 'こ',
  'サ': 'さ', 'シ': 'し', 'ス': 'す', 'セ': 'せ', 'ソ': 'そ',
  'タ': 'た', 'チ': 'ち', 'ツ': 'つ', 'テ': 'て', 'ト': 'と',
  'ナ': 'な', 'ニ': 'に', 'ヌ': 'ぬ', 'ネ': 'ね', 'ノ': 'の',
  'ハ': 'は', 'ヒ': 'ひ', 'フ': 'ふ', 'ヘ': 'へ', 'ホ': 'ほ',
  'マ': 'ま', 'ミ': 'み', 'ム': 'む', 'メ': 'め', 'モ': 'も',
  'ヤ': 'や', 'ユ': 'ゆ', 'ヨ': 'よ',
  'ラ': 'ら', 'リ': 'り', 'ル': 'る', 'レ': 'れ', 'ロ': 'ろ',
  'ワ': 'わ', 'ヰ': 'ゐ', 'ヱ': 'ゑ', 'ヲ': 'を', 'ン': 'ん',
  // voiced and semi-voiced syllables
  'ガ': 'が', 'ギ': 'ぎ', 'グ': 'ぐ', 'ゲ': 'げ', 'ゴ': 'ご',
  'ザ': 'ざ', 'ジ': 'じ', 'ズ': 'ず', 'ゼ': 'ぜ', 'ゾ': 'ぞ',
  'ダ': 'だ', 'ヂ': 'ぢ', 'ヅ': 'づ', 'デ': 'で', 'ド': 'ど',
  'バ': 'ば', 'ビ': 'び', 'ブ': 'ぶ', 'ベ': 'べ', 'ボ': 'ぼ',
  'パ': 'ぱ', 'ピ': 'ぴ', 'プ': 'ぷ', 'ペ': 'ぺ', 'ポ': 'ぽ',
  'ヴ': 'ゔ',
  // small kana
  'ァ': 'ぁ', 'ィ': 'ぃ', 'ゥ': 'ぅ', 'ェ': 'ぇ', 'ォ': 'ぉ',
  'ャ': 'ゃ', 'ュ': 'ゅ', 'ョ': 'ょ',
  'ッ': 'っ'
};
/**
 * Machines for segments that are one character long. Most entries are a
 * single fixed spelling (`literal`); し, ち, つ, ふ and じ use dedicated
 * builders that accept an alternative spelling.
 */
const SINGLE_KANA_MAPPING: KanaMapping = {
  // vowels
  'あ': literal('a'), 'い': literal('i'), 'う': literal('u'), 'え': literal('e'), 'お': literal('o'),
  // k, s, t, n, h, m rows
  'か': literal('ka'), 'き': literal('ki'), 'く': literal('ku'), 'け': literal('ke'), 'こ': literal('ko'),
  'さ': literal('sa'), 'し': shi(), 'す': literal('su'), 'せ': literal('se'), 'そ': literal('so'),
  'た': literal('ta'), 'ち': chi(), 'つ': tsu(), 'て': literal('te'), 'と': literal('to'),
  'な': literal('na'), 'に': literal('ni'), 'ぬ': literal('nu'), 'ね': literal('ne'), 'の': literal('no'),
  'は': literal('ha'), 'ひ': literal('hi'), 'ふ': fu(), 'へ': literal('he'), 'ほ': literal('ho'),
  'ま': literal('ma'), 'み': literal('mi'), 'む': literal('mu'), 'め': literal('me'), 'も': literal('mo'),
  // y, r, w rows and ん
  'や': literal('ya'), 'ゆ': literal('yu'), 'よ': literal('yo'),
  'ら': literal('ra'), 'り': literal('ri'), 'る': literal('ru'), 'れ': literal('re'), 'ろ': literal('ro'),
  'わ': literal('wa'), 'ゐ': literal('i'), 'ゑ': literal('e'), 'を': literal('wo'),
  'ん': literal('n'),
  // voiced and semi-voiced rows
  'が': literal('ga'), 'ぎ': literal('gi'), 'ぐ': literal('gu'), 'げ': literal('ge'), 'ご': literal('go'),
  'ざ': literal('za'), 'じ': ji(), 'ず': literal('zu'), 'ぜ': literal('ze'), 'ぞ': literal('zo'),
  'だ': literal('da'), 'ぢ': literal('di'), 'づ': literal('du'), 'で': literal('de'), 'ど': literal('do'),
  'ば': literal('ba'), 'び': literal('bi'), 'ぶ': literal('bu'), 'べ': literal('be'), 'ぼ': literal('bo'),
  'ぱ': literal('pa'), 'ぴ': literal('pi'), 'ぷ': literal('pu'), 'ぺ': literal('pe'), 'ぽ': literal('po'),
  'ゔ': literal('vu'),
  // long-vowel mark and whitespace
  'ー': literal('-'),
  ' ': WHITESPACE
};
// Lower-case Latin letters in the source text are typed as themselves.
for (const letter of 'abcdefghijklmnopqrstuvwxyz'.split('')) {
  SINGLE_KANA_MAPPING[letter] = literal(letter);
}

// Each small kana reuses the machine of its full-size counterpart, wrapped by
// smallKana.
for (const [small, big] of [
  ['ぁ', 'あ'],
  ['ぃ', 'い'],
  ['ぅ', 'う'],
  ['ぇ', 'え'],
  ['ぉ', 'お'],
  ['ヵ', 'か']
]) {
  SINGLE_KANA_MAPPING[small] = smallKana(SINGLE_KANA_MAPPING[big]);
}
/**
 * Machines for segments that are two characters long (a kana followed by a
 * small kana). The `literal` entries pass 0 as an extra boundary index, so the
 * transition for their first romaji character is flagged as a boundary in
 * addition to the last one.
 */
const DOUBLE_KANA_MAPPING: KanaMapping = {
  'きゃ': literal('kya', 0), 'きゅ': literal('kyu', 0), 'きょ': literal('kyo', 0),
  'しゃ': sh('a'), 'しゅ': sh('u'), 'しょ': sh('o'),
  'ちゃ': ch('a'), 'ちゅ': ch('u'), 'ちょ': ch('o'),
  'にゃ': literal('nya', 0), 'にゅ': literal('nyu', 0), 'にょ': literal('nyo', 0),
  'ひゃ': literal('hya', 0), 'ひゅ': literal('hyu', 0), 'ひょ': literal('hyo', 0),
  'みゃ': literal('mya', 0), 'みゅ': literal('myu', 0), 'みょ': literal('myo', 0),
  'りゃ': literal('rya', 0), 'りゅ': literal('ryu', 0), 'りょ': literal('ryo', 0),
  // voiced and semi-voiced
  'ぎゃ': literal('gya', 0), 'ぎゅ': literal('gyu', 0), 'ぎょ': literal('gyo', 0),
  'じゃ': j('a'), 'じゅ': j('u'), 'じょ': j('o'),
  'ぢゃ': literal('dya', 0), 'ぢゅ': literal('dyu', 0), 'ぢょ': literal('dyo', 0),
  'びゃ': literal('bya', 0), 'びゅ': literal('byu', 0), 'びょ': literal('byo', 0),
  'ぴゃ': literal('pya', 0), 'ぴゅ': literal('pyu', 0), 'ぴょ': literal('pyo', 0),
  // f- and v- combinations
  'ふぁ': literal('fa', 0), 'ふぃ': literal('fi', 0), 'ふぇ': literal('fe', 0), 'ふぉ': literal('fo', 0),
  'ゔぁ': literal('va', 0), 'ゔぃ': literal('vi', 0), 'ゔぇ': literal('ve', 0), 'ゔぉ': literal('vo', 0)
};
/** Machines for segments that are three characters long (っ + two-character kana). */
const TRIPLE_KANA_MAPPING: KanaMapping = {};

// っ followed by a single kana: registered as a two-character segment.
for (const kana of (
  'かきくけこ' +
  'さしすせそ' +
  'たちつてと' +
  'はひふへほ' +
  'がぎぐげご' +
  'ざじずぜぞ' +
  'だぢづでど' +
  'ばびぶべぼ' +
  'ぱぴぷぺぽ' +
  'ゔ'
).split('')) {
  DOUBLE_KANA_MAPPING['っ' + kana] = smallTsu(SINGLE_KANA_MAPPING[kana]);
}

// っ followed by a two-character kana: registered as a three-character segment.
for (const kana of [
  'きゃ きゅ きょ',
  'しゃ しゅ しょ',
  'ちゃ ちゅ ちょ',
  'ぎゃ ぎゅ ぎょ',
  'じゃ じゅ じょ',
  'ぢゃ ぢゅ ぢょ',
  'びゃ びゅ びょ',
  'ぴゃ ぴゅ ぴょ',
  'ふぁ ふぃ ふぇ ふぉ',
  'ゔぁ ゔぃ ゔぇ ゔぉ'
].join(' ').split(' ')) {
  TRIPLE_KANA_MAPPING['っ' + kana] = smallTsu(DOUBLE_KANA_MAPPING[kana]);
}
/**
 * Normalizes input for matching. Letters are lower-cased, katakana listed in
 * KATAKANA_MAPPING is replaced by hiragana, and every whitespace character
 * becomes a plain space.
 *
 * The result always has exactly as many UTF-16 code units as `input`, because
 * callers index the normalized and the original string with the same
 * positions in order to display the original source kana.
 *
 * @param input raw text to normalize
 * @returns the normalized text, the same length as `input`
 */
function normalizeInput(input: string): string {
  let lowered = input.toLowerCase();
  if (lowered.length !== input.length) {
    // Lower-casing can lengthen a string: 'İ' (U+0130) becomes 'i' followed
    // by U+0307. Lower-casing never shortens, so a length mismatch means at
    // least one such character is present. Redo the conversion one code unit
    // at a time and keep any unit whose lower-case form is not a single unit.
    lowered = input.split('').map(unit => {
      const lower = unit.toLowerCase();
      return lower.length === 1 ? lower : unit;
    }).join('');
  }
  return lowered.split('').map(letter => {
    const hiragana = KATAKANA_MAPPING[letter];
    if (hiragana !== undefined) {
      return hiragana;
    }
    return /\s/.test(letter) ? ' ' : letter;
  }).join('');
}
/**
 * Tracks typing progress through a piece of kana text.
 *
 * The text is cut into segments of up to three characters. Each segment is
 * paired with a clone of the state machine registered for it, and consecutive
 * machines are linked through `nextMachine`.
 */
export class KanaInputState {
  /** Original (un-normalized) text of each matched segment, in order. */
  kana: string[];
  /** Machine for each segment; `stateMachines[i]` belongs to `kana[i]`. */
  stateMachines: StateMachine[];
  /** Index of the machine that receives the next input. */
  currentIndex: number;

  /**
   * @param input text to be typed; characters with no registered machine are
   *   skipped
   */
  constructor(input: string) {
    const segments: string[] = [];
    const machines: StateMachine[] = [];
    // tables[n - 1] holds the machines for segments that are n characters long
    const tables = [
      SINGLE_KANA_MAPPING,
      DOUBLE_KANA_MAPPING,
      TRIPLE_KANA_MAPPING
    ];
    // the normalized copy is padded with one space, which per the original
    // author makes checking three characters at a time simpler
    const normalized = normalizeInput(input) + ' ';

    let position = 0;
    while (position < input.length) {
      // a position without any match still advances by one
      let consumed = 1;
      // longest segment first
      for (const width of [3, 2, 1]) {
        const template = tables[width - 1][normalized.slice(position, position + width)];
        if (template == null) {
          continue;
        }
        segments.push(input.slice(position, position + width));
        const machine = template.clone();
        if (machines.length > 0) {
          machines[machines.length - 1].nextMachine = machine;
        }
        machines.push(machine);
        consumed = width;
        break;
      }
      position += consumed;
    }

    this.kana = segments;
    this.stateMachines = machines;
    this.currentIndex = 0;
  }

  /**
   * Calls `func` with every segment and its machine, in order.
   *
   * @returns the collected return values
   */
  map<T>(func: (s: string, m: StateMachine) => T): T[] {
    return this.kana.map((segment, index) => func(segment, this.stateMachines[index]));
  }

  /**
   * Passes `input` to the current machine, then moves `currentIndex` past
   * every machine that reports it is finished.
   *
   * @returns true when this call ran out of machines; false otherwise,
   *   including when there was no machine left to receive the input
   */
  handleInput(input: string): boolean {
    if (this.currentIndex >= this.stateMachines.length) {
      return false;
    }

    let active = this.stateMachines[this.currentIndex];
    active.transition(input);
    for (; active.isFinished(); ) {
      this.currentIndex += 1;
      active = this.stateMachines[this.currentIndex];
      if (active == null) {
        return true;
      }
    }
    return this.currentIndex >= this.stateMachines.length;
  }

  /**
   * @returns the displays of the current and all later machines, concatenated
   */
  getRemainingInput(): string {
    return this.stateMachines
      .slice(this.currentIndex)
      .reduce((text, machine) => text + machine.getDisplay(), '');
  }
}
- }
|