Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix URL and DOI parsing when ligatures are involved
Browse files Browse the repository at this point in the history
mrtcode committed Oct 17, 2024
1 parent 95d08fe commit 80dd982
Showing 2 changed files with 13 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
@@ -3116,6 +3116,8 @@ class PartialEvaluator {
// ligatures and, more importantly, avoids 'e\u00b4' being converted into 'e \u0301',
// which is quite common in Spanish author names; the inserted space prevents
// author name recognition
// NOTE: THIS CAN STILL HAVE DECOMPOSED LIGATURES IF THE FONT HAS ITS OWN CHARACTER MAPPING,
// THEREFORE CONSIDER DITCHING THIS PROPERTY
u: glyphUnicode.length === 1 ? glyphUnicode : glyph.unicode,
rect,
fontSize,
19 changes: 11 additions & 8 deletions src/core/module/link/parsed-overlays.js
Original file line number Diff line number Diff line change
@@ -44,26 +44,29 @@ export function getParsedOverlays(chars) {
let text = '';
for (let j = sequence.from; j <= sequence.to; j++) {
let char = chars[j];
// Use the original char to avoid decomposed ligatures increasing match length
text += char.u;
// Only add a single character even if the glyph maps to multiple
// characters (a ligature). Also make sure char.c isn't empty,
// even though that might never happen
text += char.c[0] || '_';
}
let match = text.match(urlRegExp);
if (match) {
let url = match[0];
let from = sequence.from + match.index;
let to = from + match[0].length - 1;
let url = chars.slice(from, to).map(x => x.c).join('');
if (url.includes('@')) {
continue;
}
url = url.replace(/[.)]*$/, '');
let from = sequence.from + match.index;
let to = from + url.length;
links.push({ from, to, url });
}
match = text.match(doiRegExp);
if (match) {
let from = sequence.from + match.index;
let to = from + match[0].length;
let url = 'https://doi.org/' + encodeURIComponent(match[0]);
links.push({ from, to, text: match[0], url });
let to = from + match[0].length - 1;
let doi = chars.slice(from, to).map(x => x.c).join('');
let url = 'https://doi.org/' + encodeURIComponent(doi);
links.push({ from, to, text: doi, url });
continue;
}
}

0 comments on commit 80dd982

Please sign in to comment.