Skip to content

Commit

Permalink
Tighter integration between text matching/trimming/highlighting
Browse files Browse the repository at this point in the history
When we trim a text around a pattern match and later highlight
that match, pass the match position along
from the trimming phase to the highlighting phase.

This avoids a round of unnecessary text matching, and fixes the corner case
when the trimmed part is actually shorter than the pattern.
  • Loading branch information
csillag committed May 10, 2024
1 parent 756b4a1 commit a6f8cc8
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 31 deletions.
9 changes: 2 additions & 7 deletions src/app/components/HighlightedText/HighlightedTrimmedText.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,6 @@ type HighlightedTrimmedTextProps = {
*/
export const HighlightedTrimmedText: FC<HighlightedTrimmedTextProps> = props => {
const { text, pattern, fragmentLength, options } = props
return (
<HighlightedText
text={trimAroundMatch(text, pattern, { fragmentLength })}
pattern={pattern}
options={options}
/>
)
const { part, match } = trimAroundMatch(text, pattern, { fragmentLength })
return <HighlightedText text={part} pattern={pattern} part={match} options={options} />
}
38 changes: 29 additions & 9 deletions src/app/components/HighlightedText/index.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import * as React from 'react'
import { findTextMatch, NormalizerOptions } from './text-matching'
import { MatchInfo, findTextMatch, NO_MATCH, NormalizerOptions } from './text-matching'
import { FC } from 'react'
import { SxProps } from '@mui/material/styles'
import Box from '@mui/material/Box'
Expand Down Expand Up @@ -46,6 +46,14 @@ interface HighlightedTextProps {
*/
pattern: string | undefined

/**
* Instructions about which part to highlight.
*
* If not given, we search the text for the pattern.
* If given, it is used as-is and the pattern is not considered at all;
* passing NO_MATCH means that nothing gets highlighted.
*/
part?: MatchInfo

/**
* Options for highlighting (case sensitivity, styling, etc.)
*
Expand All @@ -57,19 +65,31 @@ interface HighlightedTextProps {
/**
* Display a text, with potential pattern matches highlighted with html MARKs
*/
export const HighlightedText: FC<HighlightedTextProps> = ({ text, pattern, options = defaultHighlight }) => {
export const HighlightedText: FC<HighlightedTextProps> = ({
text,
pattern,
part,
options = defaultHighlight,
}) => {
const { sx = defaultHighlightStyle, findOptions = {} } = options
const match = findTextMatch(text, [pattern], findOptions)

return text === undefined ? undefined : match ? (
// Have we been told what to highlight exactly? If not, look for the pattern
const task = part ?? findTextMatch(text, [pattern], findOptions)

if (text === undefined) return undefined // Nothing to display
if (task === NO_MATCH) return text // We don't have to highlight anything

const beginning = text.substring(0, task.startPos)
const focus = text.substring(task.startPos, task.endPos)
const end = text.substring(task.endPos)

return (
<>
{text.substring(0, match.startPos)}
{beginning}
<Box component="mark" sx={sx}>
{text.substring(match.startPos, match.startPos + match.searchText.length)}
{focus}
</Box>
{text.substring(match.startPos + match.searchText.length)}
{end}
</>
) : (
text
)
}
32 changes: 24 additions & 8 deletions src/app/components/HighlightedText/text-matching.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@ export type { NormalizerOptions } from './text-normalization'
/**
* Store info about where we found the pattern inside the corpus
*/
export interface MatchInfo {
searchText: string
export interface PositiveMatchInfo {
startPos: number
endPos: number
}

export const NO_MATCH = 'NO_MATCH'

export type MatchInfo = PositiveMatchInfo | typeof NO_MATCH

/**
* Identify pattern matches within a corpus, also considering normalization
*
Expand All @@ -19,20 +23,32 @@ export const findTextMatch = (
rawCorpus: string | null | undefined,
search: (string | undefined)[],
options: NormalizerOptions = {},
): MatchInfo | undefined => {
): MatchInfo => {
const normalizedCorpus = normalizeTextForSearch(rawCorpus || '', options)
const matches: MatchInfo[] = search
const matches: PositiveMatchInfo[] = search
.filter((s): s is string => !!s)
.map(rawPattern => {
const normalizedPattern = normalizeTextForSearch(rawPattern!, options)
const matchStart = normalizedCorpus.indexOf(normalizedPattern)
return matchStart !== -1
? {
searchText: rawPattern,
startPos: matchStart,
endPos: matchStart + rawPattern.length,
}
: undefined
: 'NO_MATCH'
})
.filter((m): m is MatchInfo => !!m)
return matches[0]
.filter((m): m is PositiveMatchInfo => m !== NO_MATCH)
return matches[0] ?? NO_MATCH
}

/**
 * Tell whether any of the given search patterns can be found in the corpus.
 *
 * This delegates to findTextMatch (passing the same corpus, patterns and
 * normalization options) and only reports whether the outcome differs from
 * NO_MATCH; the position of the match is not exposed.
 *
 * @param rawCorpus - the text to search in (may be null or undefined)
 * @param search - the patterns to look for (entries may be undefined)
 * @param options - normalization options, forwarded unchanged to findTextMatch
 * @returns true if findTextMatch reported a match, false otherwise
 */
export const hasTextMatch = (
  rawCorpus: string | null | undefined,
  search: (string | undefined)[],
  options: NormalizerOptions = {},
): boolean => {
  const outcome = findTextMatch(rawCorpus, search, options)
  return outcome !== NO_MATCH
}
31 changes: 24 additions & 7 deletions src/app/components/HighlightedText/text-trimming.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { findTextMatch, NormalizerOptions } from './text-matching'
import { MatchInfo, findTextMatch, NO_MATCH, NormalizerOptions } from './text-matching'

export interface TrimAroundOptions extends NormalizerOptions {
/**
Expand Down Expand Up @@ -29,31 +29,48 @@ export function trimAroundMatch(
corpus: string | undefined,
pattern: string | undefined,
options: TrimAroundOptions = {},
): string | undefined {
): {
part: string | undefined
match: MatchInfo
} {
const { fragmentLength = 80, ...matchOptions } = options

if (!corpus) {
// there is nothing to see here
return undefined
return { part: undefined, match: NO_MATCH }
}

// do we have a match?
const match = pattern ? findTextMatch(corpus, [pattern], matchOptions) : undefined
const match = pattern ? findTextMatch(corpus, [pattern], matchOptions) : NO_MATCH

if (corpus.length <= fragmentLength) {
// the whole corpus fits into the max size, no need to cut.
return corpus
return { part: corpus, match }
}

// how much extra space do we have?
const buffer = fragmentLength - (pattern || '').length

const matchStart = match?.startPos ?? 0
const matchStart = match === NO_MATCH ? 0 : match.startPos

// We will start before the start of the match, by buffer / 2 chars
const startPos = Math.max(Math.min(matchStart - Math.floor(buffer / 2), corpus.length - fragmentLength), 0)
const endPos = Math.min(startPos + fragmentLength, corpus.length)

// Do the trimming
const prefix = startPos ? '…' : ''
const postFix = endPos < corpus.length - 1 ? '…' : ''
const part = prefix + corpus.substring(startPos, endPos) + postFix

// compile the result
return (startPos ? '…' : '') + corpus.substring(startPos, endPos) + (endPos < corpus.length - 1 ? '…' : '')
return {
part,
match:
match === NO_MATCH
? NO_MATCH
: {
startPos: Math.max(0, matchStart - startPos + prefix.length),
endPos: Math.min(part.length, match.endPos - startPos + prefix.length),
},
}
}

0 comments on commit a6f8cc8

Please sign in to comment.