Skip to content

Commit

Permalink
Merge pull request #38 from sillsdev/region_and_spaces_fix
Browse files Browse the repository at this point in the history
fix: langtag bug fix and region lookup (#38)
  • Loading branch information
JohnThomson authored Nov 25, 2024
2 parents 56bb927 + 12533ae commit 59e1f97
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,39 +62,36 @@ export function searchForLanguage(
// e.g. if querystring is "otl", then " otl" is a prefix match for " San Felipe Otlaltepec Popoloca " but not "botlikh"
const prefixMatchResults = exactMatchFuse.search(" " + queryString);

const fuzzyMatchFuse = new Fuse(spacePaddedLanguages as ILanguage[], {
const fuzzyMatchFuse = new Fuse(languages as ILanguage[], {
...baseFuseOptions,
threshold: 0.3,
});
const fuzzyMatchResults = fuzzyMatchFuse.search(queryString);

// Combine all the result lists with no duplicates, prioritizing whole word exact matches then prefix exact matches then all other fuzzy matches
const results = [];
const alreadyIncludedResultCodes = new Set();
// Use the results from the fuzzy match search, since the others will have incorrect match indices due to the space padding.
// But order the results in order of whole word matches, then prefix matches, then the rest with no duplicates
const resultsByIso639_3Code = new Map<string, FuseResult<ILanguage>>();
for (const result of fuzzyMatchResults) {
resultsByIso639_3Code.set(result.item.iso639_3_code, result);
}
const orderedResults = [];
for (const resultList of [
wholeWordMatchResults,
prefixMatchResults,
fuzzyMatchResults,
]) {
for (const result of resultList) {
if (!alreadyIncludedResultCodes.has(result.item.iso639_3_code)) {
results.push(result);
alreadyIncludedResultCodes.add(result.item.iso639_3_code);
for (const r of resultList) {
const isoCode = r.item.iso639_3_code;
const correctResult = resultsByIso639_3Code.get(isoCode);
if (correctResult) {
// this language was not already added as part of a previous subset
// (wholeWordMatchResults should be a subset of prefixMatchResults which should be a subset of fuzzyMatchResults)
orderedResults.push(correctResult);
resultsByIso639_3Code.delete(isoCode);
}
}
}

return results.map((r) => ({
...r,
// We trim off the spaces that we added above to find exact and prefix matches.
item: {
...r.item,
autonym: r.item.autonym ? r.item.autonym.trim() : undefined,
exonym: r.item.exonym.trim(),
names: r.item.names.map((n) => n.trim()),
languageSubtag: r.item.languageSubtag.trim(),
},
}));
return orderedResults;
}

// get language (not macrolanguage) with exact match on subtag
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { expect, it, describe } from "vitest";
import { parseLangtagFromLangChooser } from "./languageTagHandling";
import {
defaultRegionForLangTag,
parseLangtagFromLangChooser,
} from "./languageTagHandling";
import { getRegionBySubtag } from "@ethnolib/find-language";
describe("Tag parsing", () => {
it("should find a language by 2 letter language subtag", () => {
Expand Down Expand Up @@ -133,3 +136,24 @@ describe("Tag parsing", () => {
);
expect(ssh_Arab_AE_x_foobar_result?.customDetails?.dialect).toEqual("foobar");
});

describe("defaultRegionForLangTag", () => {
  // Checks, in order, that every [languageTag, expectedRegionName] pair
  // resolves to a region carrying the expected name.
  const expectRegionNames = (cases: [string, string][]) => {
    for (const [languageTag, expectedRegionName] of cases) {
      expect(defaultRegionForLangTag(languageTag)?.name).toEqual(
        expectedRegionName
      );
    }
  };

  it("should return the region for a language tag that already has a region", () => {
    expectRegionNames([
      ["en-Latn-US", "United States of America"],
      ["en-CN-x-foobar", "China"],
      ["en-JP", "Japan"],
    ]);
  });

  it("should return the region for the closest maximal equivalent of the language tag", () => {
    expectRegionNames([
      ["uz", "Uzbekistan"],
      ["uz-Cyrl", "Uzbekistan"],
      ["uz-Arab", "Afghanistan"],
      ["uz-Arab-x-foobar", "Afghanistan"],
      ["uz-Taml-x-foobar", "Uzbekistan"],
    ]);
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,24 @@ export function parseLangtagFromLangChooser(
} as ICustomizableLanguageDetails,
} as IOrthography;
}

/**
 * Finds the region to associate with a language tag.
 *
 * If the tag parses to an orthography that already carries a region, that
 * region is returned. Otherwise the region is taken from the maximal
 * equivalent tag, looked up first for the full tag, then for
 * `language-script`, then for the bare language subtag.
 *
 * @param languageTag - The language tag to inspect (e.g. "uz-Arab-x-foobar").
 * @returns The region found on the tag or on its maximal equivalent, or
 * `undefined` when no candidate yields a region.
 */
export function defaultRegionForLangTag(languageTag: string) {
  // If languageTag already has a region tag in it, use that.
  const orthography = parseLangtagFromLangChooser(languageTag);
  const explicitRegion = orthography?.customDetails?.region;
  if (explicitRegion) {
    return explicitRegion;
  }

  // Otherwise, the maximal equivalent language tag will have the region code.
  const languageSubtag = orthography?.language?.languageSubtag;
  const scriptSubtag = orthography?.script?.code;

  // Candidates from most to least specific. Only subtags that were actually
  // parsed are used, so we never look up junk keys such as "uz-undefined" or
  // "undefined" when the script (or the whole parse) is missing.
  const candidateTags = [languageTag];
  if (languageSubtag) {
    if (scriptSubtag) {
      candidateTags.push(`${languageSubtag}-${scriptSubtag}`);
    }
    candidateTags.push(languageSubtag);
  }

  // Take the first (most specific) candidate that has a maximal equivalent;
  // later candidates are not looked up once one is found.
  let maximalTag = "";
  for (const candidate of candidateTags) {
    const found = getMaximalLangtag(candidate);
    if (found) {
      maximalTag = found;
      break;
    }
  }

  const maximalTagOrthography = parseLangtagFromLangChooser(maximalTag);
  return maximalTagOrthography?.customDetails?.region;
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ export {
isUnlistedLanguage,
createTagFromOrthography,
parseLangtagFromLangChooser,
defaultDisplayName,
defaultRegionForLangTag,
} from "@ethnolib/language-chooser-react-hook";
export type {
IOrthography,
ICustomizableLanguageDetails,
defaultDisplayName,
} from "@ethnolib/language-chooser-react-hook";

0 comments on commit 59e1f97

Please sign in to comment.