Skip to content

Commit

Permalink
feat: parser folder
Browse files Browse the repository at this point in the history
  • Loading branch information
betterRunner committed Sep 6, 2021
1 parent 1687e02 commit 89a0440
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 88 deletions.
149 changes: 75 additions & 74 deletions src/content-scripts/parser/selection-meta.ts
Original file line number Diff line number Diff line change
@@ -1,135 +1,136 @@
import { isObject } from '../../utils/utils'
import { getNodeText } from '../../utils/dom'
import { isObject } from "@/utils/utils";
import { getNodeText } from "@/utils/dom";
import { Rect } from "@/types/common";

/**
 * Get the texts list from a node.
 *
 * The tree below `node` is walked depth-first. Every node that has no child
 * nodes contributes one entry (produced by `getNodeText`), in document order.
 *
 * @param node - Root of the tree to walk.
 * @returns One string per leaf node; an empty array when `node` is falsy.
 */
function getNodeTextList(node: DocumentFragment): string[] {
  // Build a querySelector-style descriptor for a single node:
  //   {tagName}#{id}.{class1}.{class2}
  const _getQuery = (node: HTMLElement): string => {
    let query = "";
    if (node.id) {
      query = `#${node.id}`;
    }
    if (node.className) {
      let className = node.className;
      if (isObject(className)) {
        // `className` is not always a string (e.g. SVG elements expose an
        // object carrying the string in `baseVal`)
        className = (className as any)?.["baseVal"] || "";
      }
      // TODO: filter the pseudo className temporarily
      className = className
        .split(" ")
        .filter((ele) => !ele.includes(":"))
        .map((className) => `.${className}`)
        .join("");
      query = `${query}${className}`;
    }
    if (node.tagName) {
      query = `${node.tagName}${query}`;
    }
    return query;
  };

  // Recurse down to the innermost nodes (the ones without children) and
  // collect their text into `arr`.
  // NOTE: `query` ({parent query} {child query} ...) is threaded through the
  // recursion but is not consumed by the leaf branch yet.
  const _iter = (
    node: HTMLElement,
    arr: string[] = [],
    query = ""
  ): string[] => {
    if (!node) return [];
    if (node.childNodes && node.childNodes.length) {
      query = `${query ? query + " " : ""}${_getQuery(node)}`;
      for (const child of Array.from(node.childNodes)) {
        _iter(child as HTMLElement, arr, query);
      }
    } else {
      arr.push(getNodeText(node));
    }
    return arr;
  };

  return _iter((node as unknown) as HTMLElement);
}

/**
 * Filter the rects with duplicated coordinates.
 *
 * Rects are grouped by their `(x, y)` origin and exactly one rect is kept per
 * origin. When several rects share an origin, the kept one is replaced by a
 * later rect if that rect is narrower with a non-zero width, or if the kept
 * rect has zero width.
 *
 * @param rects - Rects to de-duplicate; the array is not modified.
 * @returns One rect per distinct origin, ordered by first appearance of the origin.
 */
function filterDuplicateRects(rects: Rect[]): Rect[] {
  const rectByOrigin = new Map<string, Rect>();
  for (const rect of rects) {
    const key = `x:${rect.x},y:${rect.y}`;
    const kept = rectByOrigin.get(key);
    if (kept === undefined) {
      rectByOrigin.set(key, rect);
      continue;
    }
    const isNarrowerNonEmpty = kept.width > rect.width && rect.width > 0;
    if (isNarrowerNonEmpty || kept.width === 0) {
      rectByOrigin.set(key, rect);
    }
  }
  return Array.from(rectByOrigin.values());
}

/**
 * Some rects' coordinates are invalid, filter them.
 *
 * Every ordered pair of rects at different indexes is inspected. When the
 * first rect fully contains the second one:
 * - the contained rect is excluded if its width is 0;
 * - otherwise the containing rect is excluded.
 *
 * @param rects - Rects to filter; the array is not modified.
 * @returns The rects that were not excluded, in their original order.
 */
function filterInvalidCoorRects(rects: Rect[]): Rect[] {
  const excluded = new Set<Rect>();
  for (let m = 0; m < rects.length; m++) {
    for (let n = 0; n < rects.length; n++) {
      if (m === n) continue;
      const outer = rects[m];
      const inner = rects[n];
      const containsInner =
        outer.x <= inner.x &&
        outer.y <= inner.y &&
        outer.x + outer.width >= inner.x + inner.width &&
        outer.y + outer.height >= inner.y + inner.height;
      if (containsInner) {
        // exclude the inner rect if its width is 0, otherwise exclude the outer rect
        excluded.add(inner.width === 0 ? inner : outer);
      }
    }
  }
  return rects.filter((rect) => !excluded.has(rect));
}

/**
 * Meta information describing a text selection:
 * 1. `rects`: the rect areas (with coordinates) wrapping the selected text
 * 2. `texts`: the texts contained in the selection
 */
export interface SelectionMeta {
  rects: Rect[];
  texts: string[];
}
/**
 * Get the `SelectionMeta` from the current mouse selection.
 *
 * Reads the first range of `window.getSelection()`, collects the texts of its
 * cloned contents and its client rects, then filters the rects with
 * `filterDuplicateRects` and `filterInvalidCoorRects`.
 *
 * @returns The rects and texts of the selection. Both arrays are empty when
 * nothing is selected; any error raised while parsing is logged and whatever
 * was collected up to that point is returned.
 */
export function parseRectsAndTextFromSelection(): SelectionMeta {
  let rects: Rect[] = [];
  let texts: string[] = [];
  try {
    // use `window.getSelection()` to get the rects of selected texts
    const selection = window.getSelection();
    // `getRangeAt(0)` throws when the selection holds no range, so check
    // `rangeCount` instead of relying on the catch block below
    if (selection && selection.rangeCount > 0) {
      const range = selection.getRangeAt(0);
      texts = getNodeTextList(range.cloneContents());
      // Copy the fields explicitly: spreading a `DOMRect` does not copy its
      // coordinates because they are not own enumerable properties.
      rects = Array.from(range.getClientRects()).map((r) => ({
        x: r.x,
        // client rects are viewport-relative; add the vertical scroll offset
        // NOTE(review): `x` is not offset by `window.scrollX` — confirm this is intended
        y: r.y + window.scrollY,
        width: r.width,
        height: r.height,
      }));
      // filter: (1) duplicated coordinates (2) rects with invalid coordinates
      rects = filterDuplicateRects(rects);
      rects = filterInvalidCoorRects(rects);
    }
  } catch (err) {
    console.log(err);
  }
  return {
    rects,
    texts,
  };
}
32 changes: 18 additions & 14 deletions src/content-scripts/parser/text-list.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
/**
 * Format text of text list.
 *
 * Steps:
 * 1. In every entry made up of two or more whitespace characters only,
 *    remove the spaces (keeping e.g. the `\n` characters).
 * 2. Concatenate all entries.
 * 3. Collapse continuous spaces into one space.
 * 4. Collapse continuous `\n` into one `\n`.
 * 5. Replace `\n+\W\n+` with `\W` (some delimiters end up wrapped by two `\n`).
 *
 * @param arr - Text entries to format; the array is not modified.
 * @returns The formatted text; an empty string for an empty list.
 */
export function getFormattedTextFromTextList(arr: string[]): string {
  const whitespaceOnly = /^\s{2,}$/;
  const lines = arr.map((text) =>
    whitespaceOnly.test(text) ? text.replace(/ +/g, "") : text
  );
  return lines
    .join("")
    .replace(/ +/g, " ")
    .replace(/\n+/g, "\n")
    .replace(/\n+(\W)\n+/g, "$1");
}

0 comments on commit 89a0440

Please sign in to comment.