-
Notifications
You must be signed in to change notification settings - Fork 830
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Cleaning bricks to extract text before/after a pattern (#63)
* brick to extract text before * brick for extract text after * tests for extract before and after * updated docs * changelog and bump version * fix typo * fix another typo * positive -> non-negative
- Loading branch information
1 parent
f3756ab
commit 300c564
Showing
5 changed files
with
118 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import pytest | ||
|
||
import unstructured.cleaners.extract as extract | ||
|
||
|
||
def test_get_indexed_match_raises_with_bad_index():
    """A negative index is rejected with ValueError."""
    negative_index = -1
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", negative_index)
|
||
|
||
def test_get_indexed_match_raises_with_index_too_high():
    """An index past the last available match is rejected with ValueError."""
    # The text holds only three matches (indices 0-2), so 4 is out of range.
    out_of_range_index = 4
    with pytest.raises(ValueError):
        extract._get_indexed_match("BLAH BLAH BLAH", "BLAH", out_of_range_index)
|
||
|
||
def test_extract_text_before():
    """Text preceding the second "BLAH" (index 1) is returned, right-stripped."""
    sample = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    extracted = extract.extract_text_before(sample, "BLAH", 1)
    assert extracted == "Teacher: BLAH"
|
||
|
||
def test_extract_text_after():
    """Text following the first "BLAH;" (index 0) is returned, left-stripped."""
    sample = "Teacher: BLAH BLAH BLAH; Student: BLAH BLAH BLAH!"
    extracted = extract.extract_text_after(sample, "BLAH;", 0)
    assert extracted == "Student: BLAH BLAH BLAH!"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "0.2.3-dev0" # pragma: no cover | ||
__version__ = "0.2.3" # pragma: no cover |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import re | ||
|
||
|
||
def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match: | ||
if not isinstance(index, int) or index < 0: | ||
raise ValueError(f"The index is {index}. Index must be a non-negative integer.") | ||
|
||
regex_match = None | ||
for i, result in enumerate(re.finditer(pattern, text)): | ||
if i == index: | ||
regex_match = result | ||
|
||
if regex_match is None: | ||
raise ValueError(f"Result with index {index} was not found. The largest index was {i}.") | ||
|
||
return regex_match | ||
|
||
|
||
def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Return the part of ``text`` that precedes a match of ``pattern``.

    The match is selected by its zero-based ``index`` (the first occurrence by
    default) via ``_get_indexed_match``; any error that helper raises (e.g. a
    ValueError for an invalid index) propagates to the caller.

    Input
    -----
    strip: If True, removes trailing whitespace from the extracted string
    """
    match_start = _get_indexed_match(text, pattern, index).start()
    prefix = text[:match_start]
    if strip:
        return prefix.rstrip()
    return prefix
|
||
|
||
def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts text that occurs after the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a
    different index.

    The match is located with ``_get_indexed_match``; any error that helper raises
    (e.g. a ValueError for an invalid index) propagates to the caller.

    Input
    -----
    strip: If True, removes leading whitespace from the extracted string
    """
    regex_match = _get_indexed_match(text, pattern, index)
    # Everything from the end of the selected match to the end of the text.
    after_text = text[regex_match.end():]
    return after_text.lstrip() if strip else after_text