feat(data)!: replace 韻目原貌 with 來源

- Added a 廣韻外字 "韻" from 王三 (more to come)
nk2028 · Aug 6, 2024 · 4831fd0 · 4831fd0
1 parent da810cd
commit 4831fd0
Show file tree

Hide file tree

Showing 8 changed files with 195 additions and 112 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,4 +17,4 @@ yarn.lock
 /prepare/data.csv
 /prepare/v2.csv
 
-/src/data/資料.ts
+/src/data/廣韻.ts
diff --git a/prepare/main.py b/prepare/main.py
@@ -90,7 +90,7 @@ def main():
             d.setdefault(編碼 + 反切 + 韻目原貌, []).append((字頭, 字頭又作, 釋義))
 
     os.makedirs('src/data', exist_ok=True)
-    with open('src/data/資料.ts', 'w', newline='') as fout:
+    with open('src/data/廣韻.ts', 'w', newline='') as fout:
         print('export default `\\', file=fout)
         for key, 各條目 in d.items():
             print(

diff --git a/src/Qieyun.ts b/src/Qieyun.ts
@@ -1,7 +1,7 @@
 export { 音韻地位 } from './lib/音韻地位';
 export type { 部分音韻屬性, 判斷規則列表, 邊緣地位種類指定 } from './lib/音韻地位';
 
-export * as 資料 from './lib/解析資料';
+export * as 資料 from './lib/資料';
 
 export * as 表達式 from './lib/常用表達式';
 

diff --git a/src/lib/壓縮表示.spec.ts b/src/lib/壓縮表示.spec.ts
@@ -1,7 +1,7 @@
 import test from 'ava';
 
 import { decode音韻編碼, encode音韻編碼 } from './壓縮表示';
-import { iter音韻地位 } from './解析資料';
+import { iter音韻地位 } from './資料';
 import { 音韻地位 } from './音韻地位';
 
 test('測試音韻編碼', t => {

diff --git a/src/lib/解析資料.ts b/src/lib/解析資料.ts
diff --git a/src/lib/資料查詢.spec.ts → src/lib/資料.spec.ts b/src/lib/資料查詢.spec.ts → src/lib/資料.spec.ts
@@ -2,7 +2,7 @@ import { readFileSync } from 'fs';
 
 import test from 'ava';
 
-import { query字頭, query音韻地位 } from './解析資料';
+import { query字頭, query音韻地位 } from './資料';
 import { 音韻地位 } from './音韻地位';
 
 test('查「東」字的反切', t => {
@@ -59,14 +59,20 @@ test('查詢「韓」字。「韓」是《廣韻》「亦作」字頭', t => {
   t.is(res[0].字頭, '𩏑');
 });
 
-test('查詢韻目原貌', t => {
-  t.is(query字頭('劒')[0]?.韻目原貌, '梵');
-  t.is(query字頭('茝').find(({ 音韻地位 }) => 音韻地位.屬於('廢韻'))?.韻目原貌, '海');
+test('查詢來源', t => {
+  t.like(
+    query字頭('茝').find(({ 音韻地位 }) => 音韻地位.屬於('廢韻')),
+    { 來源: { 文獻: '廣韻', 韻目: '海' } },
+  );
+  t.like(
+    query字頭('韻').find(({ 音韻地位 }) => 音韻地位.屬於('B類')),
+    { 來源: { 文獻: '王三', 韻目: '震' } },
+  );
 });
 
 test('根據原資料檔查詢所有字頭', t => {
   for (const line of readFileSync('prepare/data.csv', { encoding: 'utf8' }).split('\n').slice(1, -1)) {
-    const [, , 韻目原貌1, 地位描述1, 原反切1, 字頭1, 字頭又作1, 原釋義1, 釋義補充1] = line.split(',');
+    const [, , 韻目原貌, 地位描述1, 原反切1, 字頭1, 字頭又作1, 原釋義1, 釋義補充1] = line.split(',');
     if (!地位描述1) {
       continue;
     }
@@ -75,8 +81,15 @@ test('根據原資料檔查詢所有字頭', t => {
     const 音韻地位1 = 音韻地位.from描述(地位描述1);
 
     const query = (查詢字頭: string) =>
-      query字頭(查詢字頭).some(({ 字頭: 字頭2, 音韻地位: 音韻地位2, 韻目原貌: 韻目原貌2, 反切: 反切2, 釋義: 釋義2 }) => {
-        return 字頭1 === 字頭2 && 音韻地位1.等於(音韻地位2) && 韻目原貌1 == 韻目原貌2 && 反切1 === 反切2 && 釋義1 === 釋義2;
+      query字頭(查詢字頭).some(({ 字頭: 字頭2, 音韻地位: 音韻地位2, 反切: 反切2, 釋義: 釋義2, 來源 }) => {
+        return (
+          字頭1 === 字頭2 &&
+          音韻地位1.等於(音韻地位2) &&
+          反切1 === 反切2 &&
+          釋義1 === 釋義2 &&
+          來源?.文獻 === '廣韻' &&
+          來源.韻目 === 韻目原貌
+        );
       });
 
     t.true(query(字頭1), line);

diff --git a/src/lib/資料.ts b/src/lib/資料.ts
@@ -0,0 +1,170 @@
+import 資料 from '../data/廣韻';
+
+import { decode音韻編碼, encode音韻編碼 } from './壓縮表示';
+import { 音韻地位 } from './音韻地位';
+
+type 內部檢索結果 = Readonly<{ 字頭: string; 編碼: string; 反切: string | null; 釋義: string; 來源: 來源類型 | null }>;
+
+export interface 檢索結果 {
+  字頭: string;
+  音韻地位: 音韻地位;
+  /** 反切，若未用反切注音（如「音某字某聲」）則為 `null` */
+  反切: string | null;
+  釋義: string;
+  來源: 來源類型 | null;
+}
+export type 來源類型 = 廣韻來源 | 王三來源;
+export interface 廣韻來源 {
+  文獻: '廣韻';
+  韻目: string;
+  // TODO 小韻號等
+}
+export interface 王三來源 {
+  文獻: '王三';
+  韻目: string;
+  // TODO 小韻號等
+}
+
+const m字頭檢索 = new Map<string, 內部檢索結果[]>();
+const m音韻編碼檢索 = new Map<string, 內部檢索結果[]>();
+
+// NOTE This is for ensuring *invariance*(-ish) on the type of the map of `insertInto`.
+// This way, the type of `map` (`T`) is inferred first, then the other two arguments will be checked against it, rather than the types of
+// `key` and `value` dictating what the map should be like (because TypeScript sees `map` as *covariant* by default, which is not suitable
+// for mutable operations like insertion).
+type KeyOfMap<T> = T extends Map<infer K, unknown> ? K : never;
+type ValueOfMap<T> = T extends Map<unknown, infer V> ? V : never;
+type ArrayElement<T> = T extends (infer U)[] ? U : never;
+
+function insertInto<K, V, T extends Map<K, V[]> = Map<K, V[]>>(map: T, key: KeyOfMap<T>, value: ArrayElement<ValueOfMap<T>>) {
+  if (!map.has(key)) {
+    map.set(key, [value]);
+  } else {
+    map.get(key)!.push(value);
+  }
+}
+
+(function 早期廣韻外字() {
+  const 字頭 = '韻';
+  const 編碼 = encode音韻編碼(音韻地位.from描述('云合三B真去'));
+  const record = {
+    字頭,
+    編碼,
+    反切: '爲捃',
+    釋義: '為捃反音和一',
+    來源: { 文獻: '王三' as const, 韻目: '震' },
+  };
+  insertInto(m字頭檢索, 字頭, record);
+  insertInto(m音韻編碼檢索, 編碼, record);
+})();
+
+(function 解析廣韻資料() { // Runs once at module load: fills both lookup maps from the imported 廣韻 data string.
+  const patternOuter = /([\w$]{3})(..)(.)(.*?\n)/gu; // groups: 3-char 編碼, 2-char 反切, 1-char 韻目, rest of the line (entries)
+  for (const [, 編碼, maybe反切, 韻目原貌, 各條目] of 資料.matchAll(patternOuter)) {
+    // '@@' is a placeholder in the original data to indicate that there is no 反切
+    const 反切 = maybe反切 === '@@' ? null : maybe反切;
+
+    const patternInner = /(.)((?:\+.)*)(.*?)[|\n]/gu;
+    for (const [, 字頭, 字頭又作, 釋義] of 各條目.matchAll(patternInner)) {
+      const record = { 字頭, 編碼, 反切, 釋義, 來源: { 文獻: '廣韻' as const, 韻目: 韻目原貌 } };
+      // Index under the main 字頭 and each `+X` variant; the `u` flag makes `.` match a whole code point, not half a surrogate pair.
+      insertInto(m字頭檢索, 字頭, record);
+      for (const [, 別體] of 字頭又作.matchAll(/\+(.)/gu)) {
+        insertInto(m字頭檢索, 別體, record);
+      }
+      // Also index the same record by 編碼, which query音韻地位 and iter音韻地位 read from.
+      insertInto(m音韻編碼檢索, 編碼, record);
+    }
+  }
+})();
+
+function 結果from內部結果(內部結果: 內部檢索結果): 檢索結果 {
+  const { 字頭, 編碼, 來源, ...rest } = 內部結果;
+  return {
+    字頭,
+    音韻地位: decode音韻編碼(編碼),
+    ...rest,
+    來源: 來源 ? { ...來源 } : null,
+  };
+}
+
+/**
+ * 遍歷內置資料中全部有字之音韻地位。
+ * @returns 迭代器，所有至少對應一個字頭的音韻地位
+ */
+export function* iter音韻地位(): IterableIterator<音韻地位> {
+  for (const 音韻編碼 of m音韻編碼檢索.keys()) {
+    yield decode音韻編碼(音韻編碼);
+  }
+}
+
+/**
+ * 由字頭查出相應的音韻地位、反切、解釋。
+ * @param 字頭 待查找的漢字
+ * @returns 陣列，每一項包含音韻地位和解釋
+ *
+ * 若查不到該字，則回傳空陣列。
+ * @example
+ * ```typescript
+ * > Qieyun.資料.query字頭('結');
+ * [ {
+ *   字頭: '結',
+ *   音韻地位: 音韻地位 { '見開四先入' },
+ *   反切: '古屑',
+ *   釋義: '締也古屑切十五',
+ *   來源: { 文獻: '廣韻', 韻目: '屑' },
+ * } ]
+ * > Qieyun.資料.query字頭('冷');
+ * [
+ *   {
+ *     字頭: '冷',
+ *     音韻地位: 音韻地位 { '來開四青平' },
+ *     反切: '郎丁',
+ *     釋義: '冷凙吳人云冰凌又力頂切',
+ *     來源: { 文獻: '廣韻', 韻目: '青' },
+ *   },
+ *   {
+ *     字頭: '冷',
+ *     音韻地位: 音韻地位 { '來開二庚上' },
+ *     反切: '魯打',
+ *     釋義: '寒也魯打切又魯頂切一',
+ *     來源: { 文獻: '廣韻', 韻目: '梗' },
+ *   },
+ *   {
+ *     字頭: '冷',
+ *     音韻地位: 音韻地位 { '來開四青上' },
+ *     反切: '力鼎',
+ *     釋義: '寒也又姓前趙錄有徐州刺史冷道字安義又盧打切',
+ *     來源: { 文獻: '廣韻', 韻目: '迥' },
+ *   },
+ * ]
+ * ```
+ */
+export function query字頭(字頭: string): 檢索結果[] {
+  return m字頭檢索.get(字頭)?.map(結果from內部結果) ?? [];
+}
+
+/**
+ * 查詢音韻地位對應的字頭、反切、解釋。
+ *
+ * @param 地位 待查詢的音韻地位
+ *
+ * @returns 陣列，每一項包含音韻地位和解釋
+ *
+ * 若音韻地位有音無字，則值為空陣列。
+ * @example
+ * ```typescript
+ * > 地位 = Qieyun.音韻地位.from描述('影開二銜去');
+ * > Qieyun.資料.query音韻地位(地位);
+ * [ {
+ *   字頭: '𪒠',
+ *   音韻地位: 音韻地位 { '影開二銜去' },
+ *   反切: null,
+ *   釋義: '叫呼仿佛𪒠然自得音黯去聲一',
+ *   來源: { 文獻: '廣韻', 韻目: '鑑' },
+ * } ]
+ * ```
+ */
+export function query音韻地位(地位: 音韻地位): 檢索結果[] {
+  return m音韻編碼檢索.get(encode音韻編碼(地位))?.map(結果from內部結果) ?? [];
+}
diff --git a/src/lib/音韻地位.spec.ts b/src/lib/音韻地位.spec.ts
@@ -1,6 +1,6 @@
 import test from 'ava';
 
-import { iter音韻地位 } from './解析資料';
+import { iter音韻地位 } from './資料';
 import { 判斷規則列表, 邊緣地位種類指定, 音韻地位 } from './音韻地位';
 
 // 由音韻地位得出各項音韻屬性