Skip to content

Commit

Permalink
feat: skip unknown file types [DC-1108] (#7542)
Browse files Browse the repository at this point in the history
Co-authored-by: Marie-Luise Klaus <[email protected]>
  • Loading branch information
faymarie and Marie-Luise Klaus authored Apr 12, 2024
1 parent dab58ec commit 8f6f4fc
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 1 deletion.
16 changes: 15 additions & 1 deletion haystack/nodes/file_classifier/file_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ class FileTypeClassifier(BaseComponent):

outgoing_edges = len(DEFAULT_TYPES)

def __init__(self, supported_types: Optional[List[str]] = None, full_analysis: bool = False):
def __init__(
self, supported_types: Optional[List[str]] = None, full_analysis: bool = False, raise_on_error: bool = True
):
"""
Node that sends out files on a different output edge depending on their extension.
Expand All @@ -35,9 +37,11 @@ def __init__(self, supported_types: Optional[List[str]] = None, full_analysis: b
You can't use lists with duplicate elements.
:param full_analysis: If True, the whole file is analyzed to determine the file type.
If False, only the first 2049 bytes are analyzed.
:param raise_on_error: If True, the node will raise an exception if the file type is not supported.
"""
self.full_analysis = full_analysis
self._default_types = False
self._raise_on_error = raise_on_error
if supported_types is None:
self._default_types = True
supported_types = DEFAULT_TYPES
Expand Down Expand Up @@ -121,6 +125,16 @@ def run(self, file_paths: Union[Path, List[Path], str, List[str], List[Union[Pat
try:
index = self.supported_types.index(extension) + 1
except ValueError:
if self._raise_on_error is False:
logger.warning(
"Unsupported files of type '%s' (%s) found. "
"Unsupported file types will be ignored during indexing as `raise_on_error` is set to `False`. "
"The supported types are: %s. ",
extension,
paths[0],
self.supported_types,
)
return None, None
raise ValueError(
f"Files of type '{extension}' ({paths[0]}) are not supported. "
f"The supported types are: {self.supported_types}. "
Expand Down
15 changes: 15 additions & 0 deletions test/nodes/test_filetype_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,18 @@ def test_filetype_classifier_batched_same_media_extensions(tmp_path):
output, edge = node.run_batch(test_files)
assert edge == "output_1"
assert output == {"file_paths": test_files}


@pytest.mark.unit
@pytest.mark.parametrize("file_type", ["csv", "json", "xml", "pptx", "xlsx"])
def test_filetype_classifier_raise_on_error_disabled_unsupported_file_types(tmp_path, caplog, file_type):
    """
    With `raise_on_error=False`, running the classifier on an unsupported file type must
    return `(None, None)` and log a warning instead of raising.
    """
    node = FileTypeClassifier(raise_on_error=False)
    # NOTE(review): the file is never created on disk; this presumably relies on the classifier
    # deciding from the extension alone when one is present -- confirm.
    test_file = tmp_path / f"test.{file_type}"
    caplog.clear()
    with caplog.at_level(logging.WARNING):
        output, edge = node.run(test_file)
    # Identity checks, one per value: `== None` can be satisfied by any object with a
    # permissive `__eq__`, and separate asserts show which of the two values was wrong.
    assert output is None
    assert edge is None
    assert (
        f"Unsupported files of type '{file_type}' ({test_file!s}) found. Unsupported file types will be ignored"
        in caplog.text
    )

0 comments on commit 8f6f4fc

Please sign in to comment.