Skip to content

Commit

Permalink
Refactoring #271
Browse files Browse the repository at this point in the history
  • Loading branch information
tadashi-aikawa committed Feb 4, 2024
1 parent e62ae9e commit 6989c41
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 54 deletions.
4 changes: 4 additions & 0 deletions src/tokenizer/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ import type { Settings } from "../setting/settings";

export type TrimTarget = "input" | "indexing";

/**
 * Optional arguments accepted by tokenizer constructors.
 */
export interface FactoryArgs {
// NOTE(review): presumably decides whether `_` is kept inside a word instead of
// acting as a delimiter; no code visible here reads this flag yet — confirm.
treatUnderscoreAsPartOfWord?: boolean;
}

export interface Tokenizer {
tokenize(content: string, raw?: boolean): string[];
recursiveTokenize(content: string): { word: string; offset: number }[];
Expand Down
40 changes: 40 additions & 0 deletions src/tokenizer/tokenizers/AbstractTokenizer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import { ExhaustiveError } from "../../errors";
import {
type FactoryArgs,
type Tokenizer,
type TrimTarget,
} from "../tokenizer";

/** Default delimiter set used when the trim target is "input". */
const INPUT_TRIM_CHAR_PATTERN = /[\n\t\[\]$/:?!=()<>"',|;*~ `_“„«»‹›‚‘’”]/g;
/** Default delimiter set used when the trim target is "indexing" (same set minus `$`). */
const INDEXING_TRIM_CHAR_PATTERN = /[\n\t\[\]/:?!=()<>"',|;*~ `_“„«»‹›‚‘’”]/g;

/**
 * Base class for tokenizers.
 *
 * Holds the two trim patterns (one per {@link TrimTarget}) and supplies the
 * shared `getTrimPattern` / `shouldIgnoreOnCurrent` behaviour. Subclasses
 * provide the actual tokenization and may replace the protected patterns.
 */
export abstract class AbstractTokenizer implements Tokenizer {
  /** Pattern handed out for the "input" target; starts as the module default. */
  protected inputTrimCharPattern: RegExp = INPUT_TRIM_CHAR_PATTERN;
  /** Pattern handed out for the "indexing" target; starts as the module default. */
  protected indexingTrimCharPattern: RegExp = INDEXING_TRIM_CHAR_PATTERN;

  /**
   * @param _args Factory arguments; accepted for a uniform constructor
   *   signature but not read by this base class.
   */
  constructor(_args?: FactoryArgs) {
    // Nothing to do: both patterns are set by the field initializers above.
  }

  /**
   * Splits `content` into tokens.
   *
   * @param content Text to tokenize.
   * @param raw Optional flag whose handling is left to the subclass.
   */
  abstract tokenize(content: string, raw?: boolean): string[];

  /**
   * Produces words together with an offset for each of them.
   *
   * @param content Text to tokenize.
   */
  abstract recursiveTokenize(
    content: string
  ): { word: string; offset: number }[];

  /**
   * Returns the trim pattern currently configured for `target`.
   *
   * @param target Which pattern to return.
   * @throws ExhaustiveError when `target` is neither "input" nor "indexing".
   */
  getTrimPattern(target: TrimTarget): RegExp {
    if (target === "input") {
      return this.inputTrimCharPattern;
    }
    if (target === "indexing") {
      return this.indexingTrimCharPattern;
    }
    // Both known targets were handled above, so `target` is `never` here.
    throw new ExhaustiveError(target);
  }

  /**
   * Default implementation: never ignores the given string.
   *
   * @param _str Unused here; subclasses may inspect it.
   * @returns Always `false`.
   */
  shouldIgnoreOnCurrent(_str: string): boolean {
    return false;
  }
}
16 changes: 5 additions & 11 deletions src/tokenizer/tokenizers/ArabicTokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
import { ExhaustiveError } from "../../errors";
import type { TrimTarget } from "../tokenizer";
import type { FactoryArgs } from "../tokenizer";
import { DefaultTokenizer } from "./DefaultTokenizer";

// Trim patterns for Arabic text. Compared with the default patterns they also
// list `.` and the Arabic comma/semicolon (`،`, `؛`), and they do not list `_`
// or the typographic quote characters.
// NOTE(review): here `$` appears only in the INDEXING pattern, whereas the
// default patterns list `$` only in the INPUT pattern — confirm this is
// intentional and not a swap.
const INPUT_ARABIC_TRIM_CHAR_PATTERN = /[\n\t\[\]/:?!=()<>"'.,|;*~ `،؛]/g;
const INDEXING_ARABIC_TRIM_CHAR_PATTERN = /[\n\t\[\]$/:?!=()<>"'.,|;*~ `،؛]/g;

export class ArabicTokenizer extends DefaultTokenizer {
getTrimPattern(target: TrimTarget): RegExp {
switch (target) {
case "input":
return INPUT_ARABIC_TRIM_CHAR_PATTERN;
case "indexing":
return INDEXING_ARABIC_TRIM_CHAR_PATTERN;
default:
throw new ExhaustiveError(target);
}
/**
 * Creates the tokenizer and replaces both inherited trim patterns with the
 * Arabic-specific ones.
 *
 * @param _args Factory arguments; accepted but not read in this constructor.
 */
constructor(_args?: FactoryArgs) {
// NOTE(review): `_args` is not forwarded to the parent constructor — confirm
// that is intended once the base classes start reading FactoryArgs.
super();
// Must run after super(): the parent constructor is what installs the default patterns.
this.inputTrimCharPattern = INPUT_ARABIC_TRIM_CHAR_PATTERN;
this.indexingTrimCharPattern = INDEXING_ARABIC_TRIM_CHAR_PATTERN;
}
}
13 changes: 2 additions & 11 deletions src/tokenizer/tokenizers/ChineseTokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import { type Tokenizer, type TrimTarget } from "../tokenizer";
import chineseTokenizer from "chinese-tokenizer";
import { getTrimPattern } from "./DefaultTokenizer";
import { AbstractTokenizer } from "./AbstractTokenizer";

/**
* Chinese needs its own, language-specific tokenization logic.
*/
export class ChineseTokenizer implements Tokenizer {
export class ChineseTokenizer extends AbstractTokenizer {
_tokenize: ReturnType<typeof chineseTokenizer.load>;

static create(dict: string): ChineseTokenizer {
Expand Down Expand Up @@ -41,12 +40,4 @@ export class ChineseTokenizer implements Tokenizer {

return ret;
}

getTrimPattern(target: TrimTarget): RegExp {
return getTrimPattern(target);
}

shouldIgnoreOnCurrent(str: string): boolean {
return false;
}
}
28 changes: 3 additions & 25 deletions src/tokenizer/tokenizers/DefaultTokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,26 +1,12 @@
import { type Tokenizer, type TrimTarget } from "../tokenizer";
import { splitRaw } from "../../util/strings";
import { ExhaustiveError } from "../../errors";
import type { FactoryArgs } from "../tokenizer";
import { AbstractTokenizer } from "./AbstractTokenizer";

/**
 * Splits `content` at every match of `trimPattern` and returns the pieces in
 * their original order, dropping the empty strings that adjacent, leading or
 * trailing delimiters produce.
 *
 * @param content Text to split.
 * @param trimPattern Pattern whose matches act as delimiters.
 * @returns The non-empty pieces.
 */
function pickTokens(content: string, trimPattern: RegExp): string[] {
  const tokens: string[] = [];
  for (const piece of content.split(trimPattern)) {
    if (piece !== "") {
      tokens.push(piece);
    }
  }
  return tokens;
}

const INPUT_TRIM_CHAR_PATTERN = /[\n\t\[\]$/:?!=()<>"',|;*~ `_“„«»‹›‚‘’”]/g;
const INDEXING_TRIM_CHAR_PATTERN = /[\n\t\[\]/:?!=()<>"',|;*~ `_“„«»‹›‚‘’”]/g;

export function getTrimPattern(target: TrimTarget): RegExp {
switch (target) {
case "input":
return INPUT_TRIM_CHAR_PATTERN;
case "indexing":
return INDEXING_TRIM_CHAR_PATTERN;
default:
throw new ExhaustiveError(target);
}
}

export class DefaultTokenizer implements Tokenizer {
export class DefaultTokenizer extends AbstractTokenizer {
tokenize(content: string, raw?: boolean): string[] {
const tokens = raw
? Array.from(splitRaw(content, this.getTrimPattern("indexing"))).filter(
Expand All @@ -44,12 +30,4 @@ export class DefaultTokenizer implements Tokenizer {
})),
];
}

getTrimPattern(target: TrimTarget): RegExp {
return getTrimPattern(target);
}

shouldIgnoreOnCurrent(str: string): boolean {
return false;
}
}
9 changes: 2 additions & 7 deletions src/tokenizer/tokenizers/JapaneseTokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import TinySegmenter from "../../external/tiny-segmenter";
import { type Tokenizer, type TrimTarget } from "../tokenizer";
import { joinNumberWithSymbol } from "../../util/strings";
import { getTrimPattern } from "./DefaultTokenizer";
import { AbstractTokenizer } from "./AbstractTokenizer";
// @ts-ignore
const segmenter = new TinySegmenter();

Expand All @@ -15,7 +14,7 @@ function pickTokensAsJapanese(content: string, trimPattern: RegExp): string[] {
/**
* Japanese needs its own, language-specific tokenization logic.
*/
export class JapaneseTokenizer implements Tokenizer {
export class JapaneseTokenizer extends AbstractTokenizer {
tokenize(content: string, raw?: boolean): string[] {
return pickTokensAsJapanese(
content,
Expand Down Expand Up @@ -46,10 +45,6 @@ export class JapaneseTokenizer implements Tokenizer {
return ret;
}

getTrimPattern(target: TrimTarget): RegExp {
return getTrimPattern(target);
}

/**
 * Reports whether `str` is made up exclusively of the characters listed in
 * the pattern below. Because the pattern allows zero repetitions, the empty
 * string also yields `true`.
 *
 * @param str Text to check.
 * @returns `true` when the whole string matches the pattern.
 */
shouldIgnoreOnCurrent(str: string): boolean {
  const matched = str.match(/^[ぁ-んa-zA-Z。、ー ]*$/);
  // `match` returns null when the pattern does not match, an array otherwise.
  return matched !== null;
}
Expand Down

0 comments on commit 6989c41

Please sign in to comment.