From 6b5d8a9785cd9103963016e98b8f5bc11131c5a1 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Wed, 29 May 2024 12:14:22 -0700 Subject: [PATCH 01/13] fix: revert dropping of filename extension for some connectors (#3109) V2 refactor of ingest code introduces the removal of original file extensions. Since the upgrade of connectors is incomplete this means that some connectors will remove the original file extension and some will not. Still TBD whether this is actually something we want at all. This PR reverts specifically that change in the V2 ingest code so that original file extension is preserved downstream. ## Testing CI is passing with filenames updated via `Ingest Test Fixtures Update` workflow. --------- Co-authored-by: ryannikolaidis --- CHANGELOG.md | 10 ++++++++++ ...nd-peace-1p.json => book-war-and-peace-1p.txt.json} | 0 ...nd-peace-1p.json => book-war-and-peace-1p.txt.json} | 0 ...nd-peace-1p.json => book-war-and-peace-1p.txt.json} | 0 .../{handbook-1p.json => handbook-1p.docx.json} | 0 .../{multi-column-2p.json => multi-column-2p.pdf.json} | 0 ...ake-html-cp1252.json => fake-html-cp1252.html.json} | 0 ...le.json => layout-parser-paper-with-table.jpg.json} | 0 ...-parser-paper.json => layout-parser-paper.pdf.json} | 0 ...rticle_all.json => UDHR_first_article_all.txt.json} | 0 ...ls-for-Biomedical-Data-Scientists-2-pages.pdf.json} | 0 .../{IRS-form-1987.json => IRS-form-1987.pdf.json} | 0 .../{main.PMC6312790.json => main.PMC6312790.pdf.json} | 0 .../{main.PMC6312793.json => main.PMC6312793.pdf.json} | 0 ...PMC7234218.json => sbaa031.073.PMC7234218.pdf.json} | 0 ...outlook.json => 2023-Jan-economic-outlook.pdf.json} | 0 ...Silent-Giant-(1).json => Silent-Giant-(1).pdf.json} | 0 ...ge-with-formula.json => page-with-formula.pdf.json} | 0 ...-report.json => recalibrating-risk-report.pdf.json} | 0 ...lots_small.json => wiki_movie_plots_small.csv.json} | 0 ...outlook.json => 
2023-Jan-economic-outlook.pdf.json} | 0 ...Silent-Giant-(1).json => Silent-Giant-(1).pdf.json} | 0 ...ge-with-formula.json => page-with-formula.pdf.json} | 0 ...-report.json => recalibrating-risk-report.pdf.json} | 0 unstructured/__version__.py | 2 +- unstructured/ingest/v2/pipeline/steps/chunk.py | 2 +- unstructured/ingest/v2/pipeline/steps/embed.py | 2 +- unstructured/ingest/v2/pipeline/steps/partition.py | 2 +- .../ingest/v2/processes/connectors/fsspec/fsspec.py | 2 +- unstructured/ingest/v2/processes/connectors/local.py | 4 +--- 30 files changed, 16 insertions(+), 8 deletions(-) rename test_unstructured_ingest/expected-structured-output/embed-bedrock/{book-war-and-peace-1p.json => book-war-and-peace-1p.txt.json} (100%) rename test_unstructured_ingest/expected-structured-output/embed-vertexai/{book-war-and-peace-1p.json => book-war-and-peace-1p.txt.json} (100%) rename test_unstructured_ingest/expected-structured-output/embed/{book-war-and-peace-1p.json => book-war-and-peace-1p.txt.json} (100%) rename test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/{handbook-1p.json => handbook-1p.docx.json} (100%) rename test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/{multi-column-2p.json => multi-column-2p.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/{fake-html-cp1252.json => fake-html-cp1252.html.json} (100%) rename test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/{layout-parser-paper-with-table.json => layout-parser-paper-with-table.jpg.json} (100%) rename test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/{layout-parser-paper.json => layout-parser-paper.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/local-single-file/{UDHR_first_article_all.json => UDHR_first_article_all.txt.json} (100%) rename 
test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{Core-Skills-for-Biomedical-Data-Scientists-2-pages.json => Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{IRS-form-1987.json => IRS-form-1987.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/{main.PMC6312790.json => main.PMC6312790.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/{main.PMC6312793.json => main.PMC6312793.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/{sbaa031.073.PMC7234218.json => sbaa031.073.PMC7234218.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/{2023-Jan-economic-outlook.json => 2023-Jan-economic-outlook.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/{Silent-Giant-(1).json => Silent-Giant-(1).pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/{page-with-formula.json => page-with-formula.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/{recalibrating-risk-report.json => recalibrating-risk-report.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/s3-minio/{wiki_movie_plots_small.json => wiki_movie_plots_small.csv.json} (100%) rename test_unstructured_ingest/expected-structured-output/s3/{2023-Jan-economic-outlook.json => 2023-Jan-economic-outlook.pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/s3/{Silent-Giant-(1).json => Silent-Giant-(1).pdf.json} (100%) rename test_unstructured_ingest/expected-structured-output/s3/{page-with-formula.json => page-with-formula.pdf.json} (100%) rename 
test_unstructured_ingest/expected-structured-output/s3/{recalibrating-risk-report.json => recalibrating-risk-report.pdf.json} (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70fb72e51b..6505b42d85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.14.4-dev + +### Enhancements + +### Features + +### Fixes + +* **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change. + ## 0.14.3 ### Enhancements diff --git a/test_unstructured_ingest/expected-structured-output/embed-bedrock/book-war-and-peace-1p.json b/test_unstructured_ingest/expected-structured-output/embed-bedrock/book-war-and-peace-1p.txt.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/embed-bedrock/book-war-and-peace-1p.json rename to test_unstructured_ingest/expected-structured-output/embed-bedrock/book-war-and-peace-1p.txt.json diff --git a/test_unstructured_ingest/expected-structured-output/embed-vertexai/book-war-and-peace-1p.json b/test_unstructured_ingest/expected-structured-output/embed-vertexai/book-war-and-peace-1p.txt.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/embed-vertexai/book-war-and-peace-1p.json rename to test_unstructured_ingest/expected-structured-output/embed-vertexai/book-war-and-peace-1p.txt.json diff --git a/test_unstructured_ingest/expected-structured-output/embed/book-war-and-peace-1p.json b/test_unstructured_ingest/expected-structured-output/embed/book-war-and-peace-1p.txt.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/embed/book-war-and-peace-1p.json rename to test_unstructured_ingest/expected-structured-output/embed/book-war-and-peace-1p.txt.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.json 
b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.json rename to test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.json rename to test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.json rename to test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.json rename to test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json diff 
--git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.json rename to test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.json b/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.json rename to test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.json rename to 
test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/2023-Jan-economic-outlook.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/2023-Jan-economic-outlook.pdf.json similarity index 100% rename from 
test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/2023-Jan-economic-outlook.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/2023-Jan-economic-outlook.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/Silent-Giant-(1).json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/Silent-Giant-(1).pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/Silent-Giant-(1).json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/Silent-Giant-(1).pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/recalibrating-risk-report.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/recalibrating-risk-report.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/recalibrating-risk-report.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/recalibrating-risk-report.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.json b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.json rename to 
test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json diff --git a/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.json b/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.json rename to test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).json b/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).json rename to test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.json b/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/s3/page-with-formula.json rename to test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.json b/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.json rename to test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 541dd86bd1..41a505b633 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.3" # pragma: no cover +__version__ = "0.14.4-dev" # pragma: no cover diff --git 
a/unstructured/ingest/v2/pipeline/steps/chunk.py b/unstructured/ingest/v2/pipeline/steps/chunk.py index fc31179a46..d8a4506b03 100644 --- a/unstructured/ingest/v2/pipeline/steps/chunk.py +++ b/unstructured/ingest/v2/pipeline/steps/chunk.py @@ -43,7 +43,7 @@ def should_chunk(self, filepath: Path, file_data: FileData) -> bool: return False def get_output_filepath(self, filename: Path) -> Path: - hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json" + hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" filepath = (self.cache_dir / hashed_output_file).resolve() filepath.parent.mkdir(parents=True, exist_ok=True) return filepath diff --git a/unstructured/ingest/v2/pipeline/steps/embed.py b/unstructured/ingest/v2/pipeline/steps/embed.py index 32f818476c..7dcb94ae4e 100644 --- a/unstructured/ingest/v2/pipeline/steps/embed.py +++ b/unstructured/ingest/v2/pipeline/steps/embed.py @@ -43,7 +43,7 @@ def should_embed(self, filepath: Path, file_data: FileData) -> bool: return False def get_output_filepath(self, filename: Path) -> Path: - hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json" + hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" filepath = (self.cache_dir / hashed_output_file).resolve() filepath.parent.mkdir(parents=True, exist_ok=True) return filepath diff --git a/unstructured/ingest/v2/pipeline/steps/partition.py b/unstructured/ingest/v2/pipeline/steps/partition.py index 4b53627f9c..96bcb9b87c 100644 --- a/unstructured/ingest/v2/pipeline/steps/partition.py +++ b/unstructured/ingest/v2/pipeline/steps/partition.py @@ -38,7 +38,7 @@ def should_partition(self, filepath: Path, file_data: FileData) -> bool: return False def get_output_filepath(self, filename: Path) -> Path: - hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json" + hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" filepath = (self.cache_dir / hashed_output_file).resolve() filepath.parent.mkdir(parents=True, 
exist_ok=True) return filepath diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py b/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py index b9cd74aa01..5f2a1fb544 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py @@ -308,7 +308,7 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non Path(self.upload_config.path_without_protocol) / file_data.source_identifiers.relative_path ) - updated_upload_path = upload_path.parent / f"{upload_path.stem}.json" + updated_upload_path = upload_path.parent / f"{upload_path.name}.json" upload_path_str = str(updated_upload_path) path_str = str(path.resolve()) if self.fs.exists(path=upload_path_str) and not self.upload_config.overwrite: diff --git a/unstructured/ingest/v2/processes/connectors/local.py b/unstructured/ingest/v2/processes/connectors/local.py index faf38b7e7f..00e7a4ab84 100644 --- a/unstructured/ingest/v2/processes/connectors/local.py +++ b/unstructured/ingest/v2/processes/connectors/local.py @@ -149,9 +149,7 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: for content in contents: identifiers = content.file_data.source_identifiers new_path = self.upload_config.output_path / identifiers.relative_path - final_path = str(new_path).replace( - identifiers.filename, f"{identifiers.filename_stem}.json" - ) + final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json") Path(final_path).parent.mkdir(parents=True, exist_ok=True) logger.debug(f"copying file from {content.path} to {final_path}") shutil.copy(src=str(content.path), dst=str(final_path)) From 2ecaf5e38c4996dfe53abbc4c6a4f74c8a90b25d Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 29 May 2024 16:41:32 -0400 Subject: [PATCH 02/13] fix: remove 404 from docs (#3112) ### Summary Removes 404 from the docs build to avoid rate limiting behavior. 
--- docs/source/404.rst | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 docs/source/404.rst diff --git a/docs/source/404.rst b/docs/source/404.rst deleted file mode 100644 index 51db004bc7..0000000000 --- a/docs/source/404.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _404: - -404 Error -========= - -.. raw:: html - - - From f2e67539b1c02d95b2c04f83533ff07af14b0a91 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Wed, 29 May 2024 14:36:05 -0700 Subject: [PATCH 03/13] rfctr: clean MSG partitioner and tests as prep (#3107) **Summary** Fix type errors and generally prepare `partition_msg()` and its tests for refactoring to use `python-oxmsg` library instead of the problematic `msg_parser` library for partitioning Outlook MSG files. --- CHANGELOG.md | 2 +- test_unstructured/partition/test_doc.py | 2 +- test_unstructured/partition/test_msg.py | 94 +++++++++++-------------- test_unstructured/unit_utils.py | 3 +- typings/msg_parser/__init__.pyi | 16 +++++ typings/pptx/oxml/xmlchemy.pyi | 2 +- unstructured/__version__.py | 2 +- unstructured/partition/msg.py | 13 ++-- 8 files changed, 72 insertions(+), 62 deletions(-) create mode 100644 typings/msg_parser/__init__.pyi diff --git a/CHANGELOG.md b/CHANGELOG.md index 6505b42d85..60d35ec560 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.4-dev +## 0.14.4-dev1 ### Enhancements diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index f7dc9122b1..9531de8f5c 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -35,7 +35,7 @@ is_in_docker = os.path.exists("/.dockerenv") -def test_partition_doc_matches_partition_docx(request): +def test_partition_doc_matches_partition_docx(request: FixtureRequest): # NOTE(robinson) - was having issues with the tempfile not being found in the docker tests if is_in_docker: request.applymarker(pytest.mark.xfail) diff --git a/test_unstructured/partition/test_msg.py 
b/test_unstructured/partition/test_msg.py index f1ae890a37..f6e9ded9ca 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -1,9 +1,19 @@ -import os +"""Test suite for `unstructured.partition.msg` module.""" + +from __future__ import annotations + +import pathlib import msg_parser import pytest +from pytest_mock import MockFixture -from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path +from test_unstructured.unit_utils import ( + LogCaptureFixture, + MonkeyPatch, + assert_round_trips_through_JSON, + example_doc_path, +) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( ElementMetadata, @@ -73,12 +83,12 @@ def test_partition_msg_from_filename_with_metadata_filename(): class MockMsOxMessage: - def __init__(self, filename): + def __init__(self, filename: str): self.body = "Here is an email with plain text." self.header_dict = {"Content-Type": "text/plain"} -def test_partition_msg_from_filename_with_text_content(monkeypatch): +def test_partition_msg_from_filename_with_text_content(monkeypatch: MonkeyPatch): monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage) filename = example_doc_path("fake-email.msg") elements = partition_msg(filename=filename) @@ -146,15 +156,11 @@ def test_partition_msg_from_file_exclude_metadata(): assert elements[i].metadata.to_dict() == {} -def test_partition_msg_can_process_attachments( - tmpdir, - filename="example-docs/fake-email-attachment.msg", -): - extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname) - attachment_filename = os.path.join( - tmpdir.dirname, - ATTACH_EXPECTED_OUTPUT[0]["filename"], - ) +def test_partition_msg_can_process_attachments(tmp_path: pathlib.Path): + file_path = example_doc_path("fake-email-attachment.msg") + tmp_dir_path = str(tmp_path) + extract_msg_attachment_info(filename=file_path, output_dir=tmp_dir_path) + attachment_filename = str(tmp_path / 
str(ATTACH_EXPECTED_OUTPUT[0]["filename"])) mocked_last_modification_date = "2029-07-05T09:24:28" @@ -165,10 +171,10 @@ def test_partition_msg_can_process_attachments( ) expected_metadata = attachment_elements[0].metadata expected_metadata.file_directory = None - expected_metadata.attached_to_filename = filename + expected_metadata.attached_to_filename = file_path elements = partition_msg( - filename=filename, + filename=file_path, attachment_partitioner=partition_text, process_attachments=True, metadata_last_modified=mocked_last_modification_date, @@ -187,15 +193,11 @@ def test_partition_msg_can_process_attachments( assert elements[-1].metadata == expected_metadata -def test_partition_msg_can_process_min_max_wtih_attachments( - tmpdir, - filename="example-docs/fake-email-attachment.msg", -): - extract_msg_attachment_info(filename=filename, output_dir=tmpdir.dirname) - attachment_filename = os.path.join( - tmpdir.dirname, - ATTACH_EXPECTED_OUTPUT[0]["filename"], - ) +def test_partition_msg_can_process_min_max_wtih_attachments(tmp_path: pathlib.Path): + file_path = example_doc_path("fake-email-attachment.msg") + tmp_dir_path = str(tmp_path) + extract_msg_attachment_info(filename=file_path, output_dir=tmp_dir_path) + attachment_filename = str(tmp_path / str(ATTACH_EXPECTED_OUTPUT[0]["filename"])) attachment_elements = partition_text( filename=attachment_filename, @@ -205,7 +207,7 @@ def test_partition_msg_can_process_min_max_wtih_attachments( ) elements = partition_msg( - filename=filename, + filename=file_path, attachment_partitioner=partition_text, process_attachments=True, min_partition=6, @@ -221,17 +223,12 @@ def test_partition_msg_can_process_min_max_wtih_attachments( assert len(element.text) >= 6 -def test_partition_msg_raises_with_no_partitioner( - filename="example-docs/fake-email-attachment.msg", -): +def test_partition_msg_raises_with_no_partitioner(): with pytest.raises(ValueError): - partition_msg(filename=filename, process_attachments=True) + 
partition_msg(example_doc_path("fake-email-attachment.msg"), process_attachments=True) -def test_partition_msg_metadata_date_from_header( - mocker, - filename="example-docs/fake-email.msg", -): +def test_partition_msg_metadata_date_from_header(mocker: MockFixture): expected_last_modification_date = "2022-12-16T17:04:16-05:00" mocker.patch( @@ -243,33 +240,25 @@ def test_partition_msg_metadata_date_from_header( return_value=None, ) - elements = partition_msg(filename=filename) + elements = partition_msg(example_doc_path("fake-email.msg")) assert elements[0].metadata.last_modified == expected_last_modification_date -def test_partition_msg_from_file_custom_metadata_date( - filename="example-docs/fake-email.msg", -): +def test_partition_msg_from_file_custom_metadata_date(): expected_last_modification_date = "2020-07-05T09:24:28" - with open(filename, "rb") as f: - elements = partition_msg( - file=f, - metadata_last_modified=expected_last_modification_date, - ) + with open(example_doc_path("fake-email.msg"), "rb") as f: + elements = partition_msg(file=f, metadata_last_modified=expected_last_modification_date) assert elements[0].metadata.last_modified == expected_last_modification_date -def test_partition_msg_custom_metadata_date( - filename="example-docs/fake-email.msg", -): +def test_partition_msg_custom_metadata_date(): expected_last_modification_date = "2020-07-05T09:24:28" elements = partition_msg( - filename=filename, - metadata_last_modified=expected_last_modification_date, + example_doc_path("fake-email.msg"), metadata_last_modified=expected_last_modification_date ) assert elements[0].metadata.last_modified == expected_last_modification_date @@ -280,7 +269,7 @@ def test_partition_msg_with_json(): assert_round_trips_through_JSON(elements) -def test_partition_msg_with_pgp_encrypted_message(caplog): +def test_partition_msg_with_pgp_encrypted_message(caplog: LogCaptureFixture): elements = partition_msg(example_doc_path("fake-encrypted.msg")) assert elements == [] @@ 
-288,12 +277,13 @@ def test_partition_msg_with_pgp_encrypted_message(caplog): assert "Encrypted email detected" in caplog.text -def test_add_chunking_strategy_by_title_on_partition_msg( - filename=example_doc_path("fake-email.msg"), -): +def test_add_chunking_strategy_by_title_on_partition_msg(): + filename = example_doc_path("fake-email.msg") + elements = partition_msg(filename=filename) chunk_elements = partition_msg(filename, chunking_strategy="by_title") chunks = chunk_by_title(elements) + assert chunk_elements != elements assert chunk_elements == chunks @@ -313,4 +303,4 @@ def test_partition_msg_respects_languages_arg(): def test_partition_msg_raises_TypeError_for_invalid_languages(): with pytest.raises(TypeError): filename = "example-docs/fake-email.msg" - partition_msg(filename=filename, languages="eng") + partition_msg(filename=filename, languages="eng") # pyright: ignore[reportArgumentType] diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py index 237443c521..bd7a7751ff 100644 --- a/test_unstructured/unit_utils.py +++ b/test_unstructured/unit_utils.py @@ -17,7 +17,7 @@ patch, ) -from pytest import CaptureFixture, FixtureRequest, LogCaptureFixture # noqa: PT013 +from pytest import CaptureFixture, FixtureRequest, LogCaptureFixture, MonkeyPatch # noqa: PT013 from unstructured.documents.elements import Element from unstructured.staging.base import elements_from_json, elements_to_json @@ -29,6 +29,7 @@ "LogCaptureFixture", "MagicMock", "Mock", + "MonkeyPatch", "call", "class_mock", "function_mock", diff --git a/typings/msg_parser/__init__.pyi b/typings/msg_parser/__init__.pyi new file mode 100644 index 0000000000..86a39a6f40 --- /dev/null +++ b/typings/msg_parser/__init__.pyi @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import Any + +class MsOxMessage: + attachments: list[Attachment] + body: str | None + header_dict: dict[str, Any] + + def __init__(self, msg_file_path: str) -> None: ... 
+ +class Attachment: + AttachExtension: str | None + AttachLongFilename: str | None + AttachmentSize: int | None + data: bytes diff --git a/typings/pptx/oxml/xmlchemy.pyi b/typings/pptx/oxml/xmlchemy.pyi index e08277ee68..c8d9dd31b9 100644 --- a/typings/pptx/oxml/xmlchemy.pyi +++ b/typings/pptx/oxml/xmlchemy.pyi @@ -6,7 +6,7 @@ class BaseOxmlElement(etree.ElementBase): def __iter__(self) -> Iterator[BaseOxmlElement]: ... @property def xml(self) -> str: ... - def xpath(self, xpath_str: str) -> Any: + def xpath(self, xpath_str: str) -> Any: # pyright: ignore[reportIncompatibleMethodOverride] """Return type is typically Sequence[ElementBase], but ... lxml.etree.XPath has many possible return types including bool, (a "smart") str, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 41a505b633..23bad8442f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev" # pragma: no cover +__version__ = "0.14.4-dev1" # pragma: no cover diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 78db529d48..fe1d7f0a25 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -79,7 +79,9 @@ def partition_msg( if filename is not None: msg_obj = msg_parser.MsOxMessage(filename) - elif file is not None: + # -- `exactly_one()` call above guarantees `file` is present when `filename` is None -- + else: + assert file is not None tmp = tempfile.NamedTemporaryFile(delete=False) tmp.write(file.read()) tmp.close() @@ -213,13 +215,14 @@ def extract_msg_attachment_info( tmp.write(file.read()) tmp.close() msg_obj = msg_parser.MsOxMessage(tmp.name) - elif msg_obj is not None: + else: + assert msg_obj is not None msg_obj = msg_obj - list_attachments = [] + list_attachments: list[dict[str, Any]] = [] for attachment in msg_obj.attachments: - attachment_info = {} + attachment_info: dict[str, Any] = {} attachment_info["filename"] = attachment.AttachLongFilename 
attachment_info["extension"] = attachment.AttachExtension @@ -229,7 +232,7 @@ def extract_msg_attachment_info( list_attachments.append(attachment_info) if output_dir is not None: - output_filename = output_dir + "/" + attachment_info["filename"] + output_filename = output_dir + "/" + (attachment_info["filename"] or "unknown") with open(output_filename, "wb") as f: f.write(attachment.data) From 8415db51129b6aea194320711503348326d1fc68 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 30 May 2024 07:46:38 -0400 Subject: [PATCH 04/13] docs: make 404 pages same as index (#3114) ### Summary Makes a custom 404 page that's the same as `index.html`, so any path shows the URL for the new docs. --- docs/source/404.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/source/404.rst diff --git a/docs/source/404.rst b/docs/source/404.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/404.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. From 9acf26ec2e9258f610c8288d9b1d8ba9316edb10 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 30 May 2024 09:01:33 -0400 Subject: [PATCH 05/13] docs: explicitly replace all old pages with link to new docs (#3118) ### Summary Explicitly replaces all old docs pages with a link to the new docs. This was required because 404 redirects didn't work for pages that previously existed, though they worked for paths that never existed.
--- .github/workflows/ci.yml | 3 ++- docs/source/api.rst | 6 ++++++ docs/source/apis/api_parameters.rst | 6 ++++++ docs/source/apis/api_sdks.rst | 6 ++++++ docs/source/apis/aws_marketplace.rst | 6 ++++++ docs/source/apis/azure_marketplace.rst | 6 ++++++ docs/source/apis/saas_api.rst | 6 ++++++ docs/source/apis/usage_methods.rst | 6 ++++++ docs/source/apis/validation_errors.rst | 6 ++++++ docs/source/best_practices.rst | 6 ++++++ docs/source/best_practices/models.rst | 6 ++++++ docs/source/best_practices/strategies.rst | 6 ++++++ docs/source/best_practices/table_extraction_pdf.rst | 6 ++++++ docs/source/core.rst | 6 ++++++ docs/source/core/chunking.rst | 6 ++++++ docs/source/core/cleaning.rst | 6 ++++++ docs/source/core/embedding.rst | 6 ++++++ docs/source/core/extracting.rst | 6 ++++++ docs/source/core/partition.rst | 6 ++++++ docs/source/core/staging.rst | 6 ++++++ docs/source/examples.rst | 6 ++++++ docs/source/examples/chroma.rst | 6 ++++++ docs/source/examples/databricks.rst | 6 ++++++ docs/source/examples/dict_to_elements.rst | 6 ++++++ docs/source/ingest/configs.rst | 6 ++++++ docs/source/ingest/configs/chunking_config.rst | 6 ++++++ docs/source/ingest/configs/embedding_config.rst | 6 ++++++ docs/source/ingest/configs/fsspec_config.rst | 6 ++++++ docs/source/ingest/configs/partition_config.rst | 6 ++++++ docs/source/ingest/configs/permissions_config.rst | 6 ++++++ docs/source/ingest/configs/processor_config.rst | 6 ++++++ docs/source/ingest/configs/read_config.rst | 6 ++++++ docs/source/ingest/configs/retry_strategy_config.rst | 6 ++++++ docs/source/ingest/destination_connectors.rst | 6 ++++++ docs/source/ingest/destination_connectors/astra.rst | 6 ++++++ docs/source/ingest/destination_connectors/azure.rst | 6 ++++++ .../destination_connectors/azure_cognitive_search.rst | 6 ++++++ docs/source/ingest/destination_connectors/box.rst | 6 ++++++ docs/source/ingest/destination_connectors/chroma.rst | 6 ++++++ docs/source/ingest/destination_connectors/clarifai.rst | 
6 ++++++ .../ingest/destination_connectors/databricks_volumes.rst | 6 ++++++ docs/source/ingest/destination_connectors/delta_table.rst | 6 ++++++ docs/source/ingest/destination_connectors/dropbox.rst | 6 ++++++ docs/source/ingest/destination_connectors/elasticsearch.rst | 6 ++++++ docs/source/ingest/destination_connectors/gcs.rst | 6 ++++++ docs/source/ingest/destination_connectors/mongodb.rst | 6 ++++++ docs/source/ingest/destination_connectors/opensearch.rst | 6 ++++++ docs/source/ingest/destination_connectors/pinecone.rst | 6 ++++++ docs/source/ingest/destination_connectors/qdrant.rst | 6 ++++++ docs/source/ingest/destination_connectors/s3.rst | 6 ++++++ docs/source/ingest/destination_connectors/sql.rst | 6 ++++++ docs/source/ingest/destination_connectors/vectara.rst | 6 ++++++ docs/source/ingest/destination_connectors/weaviate.rst | 6 ++++++ docs/source/ingest/index.rst | 6 ++++++ docs/source/ingest/source_connectors.rst | 6 ++++++ docs/source/ingest/source_connectors/airtable.rst | 6 ++++++ docs/source/ingest/source_connectors/azure.rst | 6 ++++++ docs/source/ingest/source_connectors/biomed.rst | 6 ++++++ docs/source/ingest/source_connectors/box.rst | 6 ++++++ docs/source/ingest/source_connectors/confluence.rst | 6 ++++++ docs/source/ingest/source_connectors/delta_table.rst | 6 ++++++ docs/source/ingest/source_connectors/discord.rst | 6 ++++++ docs/source/ingest/source_connectors/dropbox.rst | 6 ++++++ docs/source/ingest/source_connectors/elasticsearch.rst | 6 ++++++ docs/source/ingest/source_connectors/github.rst | 6 ++++++ docs/source/ingest/source_connectors/gitlab.rst | 6 ++++++ .../ingest/source_connectors/google_cloud_storage.rst | 6 ++++++ docs/source/ingest/source_connectors/google_drive.rst | 6 ++++++ docs/source/ingest/source_connectors/jira.rst | 6 ++++++ docs/source/ingest/source_connectors/local_connector.rst | 6 ++++++ docs/source/ingest/source_connectors/mongodb.rst | 6 ++++++ docs/source/ingest/source_connectors/notion.rst | 6 ++++++ 
docs/source/ingest/source_connectors/onedrive.rst | 6 ++++++ docs/source/ingest/source_connectors/opensearch.rst | 6 ++++++ docs/source/ingest/source_connectors/outlook.rst | 6 ++++++ docs/source/ingest/source_connectors/reddit.rst | 6 ++++++ docs/source/ingest/source_connectors/s3.rst | 6 ++++++ docs/source/ingest/source_connectors/salesforce.rst | 6 ++++++ docs/source/ingest/source_connectors/sftp.rst | 6 ++++++ docs/source/ingest/source_connectors/sharepoint.rst | 6 ++++++ docs/source/ingest/source_connectors/slack.rst | 6 ++++++ docs/source/ingest/source_connectors/wikipedia.rst | 6 ++++++ docs/source/installation/docker.rst | 6 ++++++ docs/source/installation/full_installation.rst | 6 ++++++ docs/source/installing.rst | 6 ++++++ docs/source/integrations.rst | 6 ++++++ docs/source/introduction.rst | 6 ++++++ docs/source/introduction/getting_started.rst | 6 ++++++ docs/source/introduction/key_concepts.rst | 6 ++++++ docs/source/introduction/overview.rst | 6 ++++++ docs/source/metadata.rst | 6 ++++++ docs/source/platform.rst | 6 ++++++ docs/source/platforms/destination_platform.rst | 6 ++++++ docs/source/platforms/job.rst | 6 ++++++ .../platform_destinations/amazon_s3_destination.rst | 6 ++++++ .../platform_destinations/azure_cognitive_search.rst | 6 ++++++ docs/source/platforms/platform_destinations/chroma.rst | 6 ++++++ docs/source/platforms/platform_destinations/databricks.rst | 6 ++++++ .../platform_destinations/elasticsearch_destination.rst | 6 ++++++ .../platform_destinations/google_cloud_destination.rst | 6 ++++++ docs/source/platforms/platform_destinations/mongodb.rst | 6 ++++++ docs/source/platforms/platform_destinations/opensearch.rst | 6 ++++++ docs/source/platforms/platform_destinations/pinecone.rst | 6 ++++++ docs/source/platforms/platform_destinations/postgresql.rst | 6 ++++++ docs/source/platforms/platform_destinations/weaviate.rst | 6 ++++++ docs/source/platforms/platform_sources/amazon_s3_source.rst | 6 ++++++ 
docs/source/platforms/platform_sources/azure_blob.rst | 6 ++++++ .../platforms/platform_sources/elasticsearch_source.rst | 6 ++++++ .../platforms/platform_sources/google_cloud_source.rst | 6 ++++++ docs/source/platforms/platform_sources/google_drive.rst | 6 ++++++ docs/source/platforms/platform_sources/onedrive.rst | 6 ++++++ docs/source/platforms/platform_sources/opensearch.rst | 6 ++++++ docs/source/platforms/platform_sources/salesforce.rst | 6 ++++++ docs/source/platforms/platform_sources/sftp.rst | 6 ++++++ docs/source/platforms/platform_sources/sharepoint.rst | 6 ++++++ docs/source/platforms/source_platform.rst | 6 ++++++ docs/source/platforms/workflow.rst | 6 ++++++ 117 files changed, 698 insertions(+), 1 deletion(-) create mode 100644 docs/source/api.rst create mode 100644 docs/source/apis/api_parameters.rst create mode 100644 docs/source/apis/api_sdks.rst create mode 100644 docs/source/apis/aws_marketplace.rst create mode 100644 docs/source/apis/azure_marketplace.rst create mode 100644 docs/source/apis/saas_api.rst create mode 100644 docs/source/apis/usage_methods.rst create mode 100644 docs/source/apis/validation_errors.rst create mode 100644 docs/source/best_practices.rst create mode 100644 docs/source/best_practices/models.rst create mode 100644 docs/source/best_practices/strategies.rst create mode 100644 docs/source/best_practices/table_extraction_pdf.rst create mode 100644 docs/source/core.rst create mode 100644 docs/source/core/chunking.rst create mode 100644 docs/source/core/cleaning.rst create mode 100644 docs/source/core/embedding.rst create mode 100644 docs/source/core/extracting.rst create mode 100644 docs/source/core/partition.rst create mode 100644 docs/source/core/staging.rst create mode 100644 docs/source/examples.rst create mode 100644 docs/source/examples/chroma.rst create mode 100644 docs/source/examples/databricks.rst create mode 100644 docs/source/examples/dict_to_elements.rst create mode 100644 docs/source/ingest/configs.rst create mode 
100644 docs/source/ingest/configs/chunking_config.rst create mode 100644 docs/source/ingest/configs/embedding_config.rst create mode 100644 docs/source/ingest/configs/fsspec_config.rst create mode 100644 docs/source/ingest/configs/partition_config.rst create mode 100644 docs/source/ingest/configs/permissions_config.rst create mode 100644 docs/source/ingest/configs/processor_config.rst create mode 100644 docs/source/ingest/configs/read_config.rst create mode 100644 docs/source/ingest/configs/retry_strategy_config.rst create mode 100644 docs/source/ingest/destination_connectors.rst create mode 100644 docs/source/ingest/destination_connectors/astra.rst create mode 100644 docs/source/ingest/destination_connectors/azure.rst create mode 100644 docs/source/ingest/destination_connectors/azure_cognitive_search.rst create mode 100644 docs/source/ingest/destination_connectors/box.rst create mode 100644 docs/source/ingest/destination_connectors/chroma.rst create mode 100644 docs/source/ingest/destination_connectors/clarifai.rst create mode 100644 docs/source/ingest/destination_connectors/databricks_volumes.rst create mode 100644 docs/source/ingest/destination_connectors/delta_table.rst create mode 100644 docs/source/ingest/destination_connectors/dropbox.rst create mode 100644 docs/source/ingest/destination_connectors/elasticsearch.rst create mode 100644 docs/source/ingest/destination_connectors/gcs.rst create mode 100644 docs/source/ingest/destination_connectors/mongodb.rst create mode 100644 docs/source/ingest/destination_connectors/opensearch.rst create mode 100644 docs/source/ingest/destination_connectors/pinecone.rst create mode 100644 docs/source/ingest/destination_connectors/qdrant.rst create mode 100644 docs/source/ingest/destination_connectors/s3.rst create mode 100644 docs/source/ingest/destination_connectors/sql.rst create mode 100644 docs/source/ingest/destination_connectors/vectara.rst create mode 100644 docs/source/ingest/destination_connectors/weaviate.rst create 
mode 100644 docs/source/ingest/index.rst create mode 100644 docs/source/ingest/source_connectors.rst create mode 100644 docs/source/ingest/source_connectors/airtable.rst create mode 100644 docs/source/ingest/source_connectors/azure.rst create mode 100644 docs/source/ingest/source_connectors/biomed.rst create mode 100644 docs/source/ingest/source_connectors/box.rst create mode 100644 docs/source/ingest/source_connectors/confluence.rst create mode 100644 docs/source/ingest/source_connectors/delta_table.rst create mode 100644 docs/source/ingest/source_connectors/discord.rst create mode 100644 docs/source/ingest/source_connectors/dropbox.rst create mode 100644 docs/source/ingest/source_connectors/elasticsearch.rst create mode 100644 docs/source/ingest/source_connectors/github.rst create mode 100644 docs/source/ingest/source_connectors/gitlab.rst create mode 100644 docs/source/ingest/source_connectors/google_cloud_storage.rst create mode 100644 docs/source/ingest/source_connectors/google_drive.rst create mode 100644 docs/source/ingest/source_connectors/jira.rst create mode 100644 docs/source/ingest/source_connectors/local_connector.rst create mode 100644 docs/source/ingest/source_connectors/mongodb.rst create mode 100644 docs/source/ingest/source_connectors/notion.rst create mode 100644 docs/source/ingest/source_connectors/onedrive.rst create mode 100644 docs/source/ingest/source_connectors/opensearch.rst create mode 100644 docs/source/ingest/source_connectors/outlook.rst create mode 100644 docs/source/ingest/source_connectors/reddit.rst create mode 100644 docs/source/ingest/source_connectors/s3.rst create mode 100644 docs/source/ingest/source_connectors/salesforce.rst create mode 100644 docs/source/ingest/source_connectors/sftp.rst create mode 100644 docs/source/ingest/source_connectors/sharepoint.rst create mode 100644 docs/source/ingest/source_connectors/slack.rst create mode 100644 docs/source/ingest/source_connectors/wikipedia.rst create mode 100644 
docs/source/installation/docker.rst create mode 100644 docs/source/installation/full_installation.rst create mode 100644 docs/source/installing.rst create mode 100644 docs/source/integrations.rst create mode 100644 docs/source/introduction.rst create mode 100644 docs/source/introduction/getting_started.rst create mode 100644 docs/source/introduction/key_concepts.rst create mode 100644 docs/source/introduction/overview.rst create mode 100644 docs/source/metadata.rst create mode 100644 docs/source/platform.rst create mode 100644 docs/source/platforms/destination_platform.rst create mode 100644 docs/source/platforms/job.rst create mode 100644 docs/source/platforms/platform_destinations/amazon_s3_destination.rst create mode 100644 docs/source/platforms/platform_destinations/azure_cognitive_search.rst create mode 100644 docs/source/platforms/platform_destinations/chroma.rst create mode 100644 docs/source/platforms/platform_destinations/databricks.rst create mode 100644 docs/source/platforms/platform_destinations/elasticsearch_destination.rst create mode 100644 docs/source/platforms/platform_destinations/google_cloud_destination.rst create mode 100644 docs/source/platforms/platform_destinations/mongodb.rst create mode 100644 docs/source/platforms/platform_destinations/opensearch.rst create mode 100644 docs/source/platforms/platform_destinations/pinecone.rst create mode 100644 docs/source/platforms/platform_destinations/postgresql.rst create mode 100644 docs/source/platforms/platform_destinations/weaviate.rst create mode 100644 docs/source/platforms/platform_sources/amazon_s3_source.rst create mode 100644 docs/source/platforms/platform_sources/azure_blob.rst create mode 100644 docs/source/platforms/platform_sources/elasticsearch_source.rst create mode 100644 docs/source/platforms/platform_sources/google_cloud_source.rst create mode 100644 docs/source/platforms/platform_sources/google_drive.rst create mode 100644 docs/source/platforms/platform_sources/onedrive.rst create 
mode 100644 docs/source/platforms/platform_sources/opensearch.rst create mode 100644 docs/source/platforms/platform_sources/salesforce.rst create mode 100644 docs/source/platforms/platform_sources/sftp.rst create mode 100644 docs/source/platforms/platform_sources/sharepoint.rst create mode 100644 docs/source/platforms/source_platform.rst create mode 100644 docs/source/platforms/workflow.rst diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b0868a0d2e..b1dc5bfe4f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -504,4 +504,5 @@ jobs: uses: anchore/scan-action@v3 with: image: "unstructured:dev" - severity-cutoff: medium + # NOTE(robinson) - revert this to medium when we bump libreoffice + severity-cutoff: high diff --git a/docs/source/api.rst b/docs/source/api.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/api.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/apis/api_parameters.rst b/docs/source/apis/api_parameters.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/api_parameters.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/apis/api_sdks.rst b/docs/source/apis/api_sdks.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/api_sdks.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved!
Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/apis/aws_marketplace.rst b/docs/source/apis/aws_marketplace.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/aws_marketplace.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/apis/azure_marketplace.rst b/docs/source/apis/azure_marketplace.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/azure_marketplace.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/apis/saas_api.rst b/docs/source/apis/saas_api.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/saas_api.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/apis/usage_methods.rst b/docs/source/apis/usage_methods.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/usage_methods.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/apis/validation_errors.rst b/docs/source/apis/validation_errors.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/apis/validation_errors.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/best_practices.rst b/docs/source/best_practices.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/best_practices.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/best_practices/models.rst b/docs/source/best_practices/models.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/best_practices/models.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/best_practices/strategies.rst b/docs/source/best_practices/strategies.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/best_practices/strategies.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/best_practices/table_extraction_pdf.rst b/docs/source/best_practices/table_extraction_pdf.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/best_practices/table_extraction_pdf.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/core.rst b/docs/source/core.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/core/chunking.rst b/docs/source/core/chunking.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core/chunking.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/core/cleaning.rst b/docs/source/core/cleaning.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core/cleaning.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/core/embedding.rst b/docs/source/core/embedding.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core/embedding.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/core/extracting.rst b/docs/source/core/extracting.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core/extracting.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/core/partition.rst b/docs/source/core/partition.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core/partition.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/core/staging.rst b/docs/source/core/staging.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/core/staging.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/examples.rst b/docs/source/examples.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/examples.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved!
Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/examples/chroma.rst b/docs/source/examples/chroma.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/examples/chroma.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/examples/databricks.rst b/docs/source/examples/databricks.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/examples/databricks.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/examples/dict_to_elements.rst b/docs/source/examples/dict_to_elements.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/examples/dict_to_elements.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs.rst b/docs/source/ingest/configs.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/configs/chunking_config.rst b/docs/source/ingest/configs/chunking_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/chunking_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs/embedding_config.rst b/docs/source/ingest/configs/embedding_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/embedding_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs/fsspec_config.rst b/docs/source/ingest/configs/fsspec_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/fsspec_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs/partition_config.rst b/docs/source/ingest/configs/partition_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/partition_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/configs/permissions_config.rst b/docs/source/ingest/configs/permissions_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/permissions_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs/processor_config.rst b/docs/source/ingest/configs/processor_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/processor_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs/read_config.rst b/docs/source/ingest/configs/read_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/read_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/configs/retry_strategy_config.rst b/docs/source/ingest/configs/retry_strategy_config.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/configs/retry_strategy_config.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/destination_connectors.rst b/docs/source/ingest/destination_connectors.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/astra.rst b/docs/source/ingest/destination_connectors/astra.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/astra.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/azure.rst b/docs/source/ingest/destination_connectors/azure.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/azure.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/azure_cognitive_search.rst b/docs/source/ingest/destination_connectors/azure_cognitive_search.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/azure_cognitive_search.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/destination_connectors/box.rst b/docs/source/ingest/destination_connectors/box.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/box.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/chroma.rst b/docs/source/ingest/destination_connectors/chroma.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/chroma.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/clarifai.rst b/docs/source/ingest/destination_connectors/clarifai.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/clarifai.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/databricks_volumes.rst b/docs/source/ingest/destination_connectors/databricks_volumes.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/databricks_volumes.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/destination_connectors/delta_table.rst b/docs/source/ingest/destination_connectors/delta_table.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/delta_table.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/dropbox.rst b/docs/source/ingest/destination_connectors/dropbox.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/dropbox.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/elasticsearch.rst b/docs/source/ingest/destination_connectors/elasticsearch.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/elasticsearch.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/gcs.rst b/docs/source/ingest/destination_connectors/gcs.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/gcs.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/destination_connectors/mongodb.rst b/docs/source/ingest/destination_connectors/mongodb.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/mongodb.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/opensearch.rst b/docs/source/ingest/destination_connectors/opensearch.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/opensearch.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/pinecone.rst b/docs/source/ingest/destination_connectors/pinecone.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/pinecone.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/qdrant.rst b/docs/source/ingest/destination_connectors/qdrant.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/qdrant.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/destination_connectors/s3.rst b/docs/source/ingest/destination_connectors/s3.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/s3.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/sql.rst b/docs/source/ingest/destination_connectors/sql.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/sql.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/vectara.rst b/docs/source/ingest/destination_connectors/vectara.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/vectara.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/destination_connectors/weaviate.rst b/docs/source/ingest/destination_connectors/weaviate.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/destination_connectors/weaviate.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/index.rst b/docs/source/ingest/index.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/index.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors.rst b/docs/source/ingest/source_connectors.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/airtable.rst b/docs/source/ingest/source_connectors/airtable.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/airtable.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/azure.rst b/docs/source/ingest/source_connectors/azure.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/azure.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/biomed.rst b/docs/source/ingest/source_connectors/biomed.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/biomed.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/box.rst b/docs/source/ingest/source_connectors/box.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/box.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/confluence.rst b/docs/source/ingest/source_connectors/confluence.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/confluence.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/delta_table.rst b/docs/source/ingest/source_connectors/delta_table.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/delta_table.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/discord.rst b/docs/source/ingest/source_connectors/discord.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/discord.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/dropbox.rst b/docs/source/ingest/source_connectors/dropbox.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/dropbox.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/elasticsearch.rst b/docs/source/ingest/source_connectors/elasticsearch.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/elasticsearch.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/github.rst b/docs/source/ingest/source_connectors/github.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/github.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/gitlab.rst b/docs/source/ingest/source_connectors/gitlab.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/gitlab.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/google_cloud_storage.rst b/docs/source/ingest/source_connectors/google_cloud_storage.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/google_cloud_storage.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/google_drive.rst b/docs/source/ingest/source_connectors/google_drive.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/google_drive.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/jira.rst b/docs/source/ingest/source_connectors/jira.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/jira.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/local_connector.rst b/docs/source/ingest/source_connectors/local_connector.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/local_connector.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/mongodb.rst b/docs/source/ingest/source_connectors/mongodb.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/mongodb.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/notion.rst b/docs/source/ingest/source_connectors/notion.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/notion.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/onedrive.rst b/docs/source/ingest/source_connectors/onedrive.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/onedrive.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/opensearch.rst b/docs/source/ingest/source_connectors/opensearch.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/opensearch.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/outlook.rst b/docs/source/ingest/source_connectors/outlook.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/outlook.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/reddit.rst b/docs/source/ingest/source_connectors/reddit.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/reddit.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/s3.rst b/docs/source/ingest/source_connectors/s3.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/s3.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/salesforce.rst b/docs/source/ingest/source_connectors/salesforce.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/salesforce.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/sftp.rst b/docs/source/ingest/source_connectors/sftp.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/sftp.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/sharepoint.rst b/docs/source/ingest/source_connectors/sharepoint.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/sharepoint.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/ingest/source_connectors/slack.rst b/docs/source/ingest/source_connectors/slack.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/slack.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/ingest/source_connectors/wikipedia.rst b/docs/source/ingest/source_connectors/wikipedia.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/ingest/source_connectors/wikipedia.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/installation/docker.rst b/docs/source/installation/docker.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/installation/docker.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/installation/full_installation.rst b/docs/source/installation/full_installation.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/installation/full_installation.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/installing.rst b/docs/source/installing.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/installing.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/integrations.rst b/docs/source/integrations.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/integrations.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/introduction.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/introduction/getting_started.rst b/docs/source/introduction/getting_started.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/introduction/getting_started.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/introduction/key_concepts.rst b/docs/source/introduction/key_concepts.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/introduction/key_concepts.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/introduction/overview.rst b/docs/source/introduction/overview.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/introduction/overview.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/metadata.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platform.rst b/docs/source/platform.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platform.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/destination_platform.rst b/docs/source/platforms/destination_platform.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/destination_platform.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/platforms/job.rst b/docs/source/platforms/job.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/job.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/amazon_s3_destination.rst b/docs/source/platforms/platform_destinations/amazon_s3_destination.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/amazon_s3_destination.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/azure_cognitive_search.rst b/docs/source/platforms/platform_destinations/azure_cognitive_search.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/azure_cognitive_search.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/chroma.rst b/docs/source/platforms/platform_destinations/chroma.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/chroma.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/platforms/platform_destinations/databricks.rst b/docs/source/platforms/platform_destinations/databricks.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/databricks.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/elasticsearch_destination.rst b/docs/source/platforms/platform_destinations/elasticsearch_destination.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/elasticsearch_destination.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/google_cloud_destination.rst b/docs/source/platforms/platform_destinations/google_cloud_destination.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/google_cloud_destination.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/mongodb.rst b/docs/source/platforms/platform_destinations/mongodb.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/mongodb.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved!
Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/opensearch.rst b/docs/source/platforms/platform_destinations/opensearch.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/opensearch.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/pinecone.rst b/docs/source/platforms/platform_destinations/pinecone.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/pinecone.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_destinations/postgresql.rst b/docs/source/platforms/platform_destinations/postgresql.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/postgresql.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io <https://docs.unstructured.io>`_ to learn more about our +products and tools.
diff --git a/docs/source/platforms/platform_destinations/weaviate.rst b/docs/source/platforms/platform_destinations/weaviate.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_destinations/weaviate.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/amazon_s3_source.rst b/docs/source/platforms/platform_sources/amazon_s3_source.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/amazon_s3_source.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/azure_blob.rst b/docs/source/platforms/platform_sources/azure_blob.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/azure_blob.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/elasticsearch_source.rst b/docs/source/platforms/platform_sources/elasticsearch_source.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/elasticsearch_source.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! 
Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/google_cloud_source.rst b/docs/source/platforms/platform_sources/google_cloud_source.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/google_cloud_source.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/google_drive.rst b/docs/source/platforms/platform_sources/google_drive.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/google_drive.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/onedrive.rst b/docs/source/platforms/platform_sources/onedrive.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/onedrive.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/opensearch.rst b/docs/source/platforms/platform_sources/opensearch.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/opensearch.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! 
Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/salesforce.rst b/docs/source/platforms/platform_sources/salesforce.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/salesforce.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/sftp.rst b/docs/source/platforms/platform_sources/sftp.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/sftp.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/platform_sources/sharepoint.rst b/docs/source/platforms/platform_sources/sharepoint.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/platform_sources/sharepoint.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/source_platform.rst b/docs/source/platforms/source_platform.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/source_platform.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! 
Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. diff --git a/docs/source/platforms/workflow.rst b/docs/source/platforms/workflow.rst new file mode 100644 index 0000000000..c32a73bbc7 --- /dev/null +++ b/docs/source/platforms/workflow.rst @@ -0,0 +1,6 @@ +Unstructured Documentation +========================== + +The Unstructured documentation page has moved! Check out our new and improved docs page at +`https://docs.unstructured.io `_ to learn more about our +products and tools. From 293901e1447e0c0797382bb44dda67cada32ba97 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Thu, 30 May 2024 10:08:10 -0500 Subject: [PATCH 06/13] build: pin python-docx (#3110) Since we incorporate a newer feature from `python-docx` [here](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/docx.py#L521), we should make the version of `python-docx` that first supports that method an explicit requirement. I didn't pip recompile since our generated dependencies already have `python-docx==1.1.2`, but I can do that if someone thinks it's necessary. --- CHANGELOG.md | 3 ++- requirements/deps/constraints.txt | 4 ---- requirements/extra-docx.in | 3 ++- requirements/extra-odt.in | 3 ++- unstructured/__version__.py | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60d35ec560..73725edd1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.4-dev1 +## 0.14.4-dev2 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Clarified dependence on particular version of `python-docx`** Pinned `python-docx` version to ensure a particular method `unstructured` uses is included. * **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change. 
## 0.14.3 diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 325252e403..570d4949f3 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -38,10 +38,6 @@ opencv-python==4.8.0.76 opencv-contrib-python==4.8.0.76 platformdirs==3.10.0 -# Note(scanny): partition_docx() uses table features added in python-docx v1.1.2. Added here since -# multiple formats have a python-docx dependency (docx, odt) -python-docx>=1.1.2 - # TODO: Constraint due to langchain, remove when that gets updated: packaging<24.0 diff --git a/requirements/extra-docx.in b/requirements/extra-docx.in index ea1062fe66..296b718aaa 100644 --- a/requirements/extra-docx.in +++ b/requirements/extra-docx.in @@ -1,4 +1,5 @@ -c ./deps/constraints.txt -c base.txt -python-docx +# Note(scanny): partition_docx() uses table features added in python-docx v1.1.2. +python-docx>=1.1.2 diff --git a/requirements/extra-odt.in b/requirements/extra-odt.in index 29f9e05124..727a76f5e2 100644 --- a/requirements/extra-odt.in +++ b/requirements/extra-odt.in @@ -1,5 +1,6 @@ -c ./deps/constraints.txt -c base.txt -python-docx +# Note(scanny): partition_docx() uses table features added in python-docx v1.1.2. +python-docx>=1.1.2 pypandoc diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 23bad8442f..469f3223a0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev1" # pragma: no cover +__version__ = "0.14.4-dev2" # pragma: no cover From 23e570fc8ac71c5d1a5788dcb5b3a7a7a57bf078 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 30 May 2024 12:22:54 -0400 Subject: [PATCH 07/13] docs: cleanup readme; add python 3.12 (#3120) ### Summary Updates documentation references in the README to point to https://docs.unstructured.io and cleans up a few sections of the README. Specifically: - Removes an old API announcement - Removes the section mentioning Chipper as a beta feature. 
Chipper is only available through the SaaS API. Also adds a Python 3.12 tag to `setup.py` since we now support Python 3.12. --- README.md | 50 +++++++++++++++----------------------------------- setup.py | 1 + 2 files changed, 16 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 65960d2624..ca8d97c595 100644 --- a/README.md +++ b/README.md @@ -37,21 +37,7 @@

Open-Source Pre-Processing Tools for Unstructured Data

-The `unstructured` library provides open-source components for ingesting and pre-processing images and text documents, such as PDFs, HTML, Word docs, and [many more](https://unstructured-io.github.io/unstructured/core.html#partitioning). The use cases of `unstructured` revolve around streamlining and optimizing the data processing workflow for LLMs. `unstructured` modular functions and connectors form a cohesive system that simplifies data ingestion and pre-processing, making it adaptable to different platforms and efficient in transforming unstructured data into structured outputs. - -

-

API Announcement!

-

- -We are thrilled to announce our newly launched [Unstructured API](https://unstructured-io.github.io/unstructured/api.html), providing the Unstructured capabilities from `unstructured` as an API. Check out the [`unstructured-api` GitHub repository](https://github.com/Unstructured-IO/unstructured-api) to start making API calls. You’ll also find instructions about how to host your own API version. - -While access to the hosted Unstructured API will remain free, API Keys are required to make requests. To prevent disruption, get yours [here](https://unstructured.io/api-key) and start using it today! Check out the [`unstructured-api` README](https://github.com/Unstructured-IO/unstructured-api#--) to start making API calls.

- -#### :rocket: Beta Feature: Chipper Model - -We are releasing the beta version of our Chipper model to deliver superior performance when processing high-resolution, complex documents. To start using the Chipper model in your API request, you can utilize the `hi_res_model_name=chipper` parameter. Please refer to the documentation [here](https://unstructured-io.github.io/unstructured/api.html#beta-version-hi-res-strategy-with-chipper-model). - -As the Chipper model is in beta version, we welcome feedback and suggestions. For those interested in testing the Chipper model, we encourage you to connect with us on [Slack community](https://short.unstructured.io/pzw05l7). +The `unstructured` library provides open-source components for ingesting and pre-processing images and text documents, such as PDFs, HTML, Word docs, and [many more](https://docs.unstructured.io/open-source/core-functionality/partitioning). The use cases of `unstructured` revolve around streamlining and optimizing the data processing workflow for LLMs. `unstructured` modular functions and connectors form a cohesive system that simplifies data ingestion and pre-processing, making it adaptable to different platforms and efficient in transforming unstructured data into structured outputs. ## :eight_pointed_black_star: Quick Start @@ -182,29 +168,23 @@ This starts a docker container with your local repo mounted to `/mnt/local_unstr ## :clap: Quick Tour ### Documentation -This README overviews how to install, use and develop the library. For more comprehensive documentation, visit https://unstructured-io.github.io/unstructured/ . +For more comprehensive documentation, visit https://docs.unstructured.io . You can also learn +more about our other products on the documentation page, including our SaaS API. 
-### Concepts Guide +Here are a few pages from the [Open Source documentation page](https://docs.unstructured.io/open-source/introduction/overview) +that are helpful for new users to review: -The `unstructured` library includes core functionality for partitioning, chunking, cleaning, and -staging raw documents for NLP tasks. -You can see a complete list of available functions and how to use them from the [Core Functionality documentation](https://unstructured-io.github.io/unstructured/core.html). +- [Quick Start](https://docs.unstructured.io/open-source/introduction/quick-start) +- [Using the `unstructured` open source package](https://docs.unstructured.io/open-source/core-functionality/overview) +- [Connectors](https://docs.unstructured.io/open-source/ingest/overview) +- [Concepts](https://docs.unstructured.io/open-source/concepts/document-elements) +- [Integrations](https://docs.unstructured.io/open-source/integrations) -In general, these functions fall into several categories: -- *Partitioning* functions break raw documents into standard, structured elements. -- *Cleaning* functions remove unwanted text from documents, such as boilerplate and sentence fragments. -- *Staging* functions format data for downstream tasks, such as ML inference and data labeling. -- *Chunking* functions split documents into smaller sections for use in RAG apps and similarity - search. -- *Embedding* encoder classes provide an interfaces for easily converting preprocessed text to - vectors. - -The **Connectors** 🔗 in `unstructured` serve as vital links between the pre-processing pipeline and various data storage platforms. They allow for the batch processing of documents across various sources, including cloud services, repositories, and local directories. Each connector is tailored to a specific platform, such as Azure, Google Drive, or Github, and comes with unique commands and dependencies. 
To see the list of Connectors available in `unstructured` library, please check out the [Connectors GitHub folder](https://github.com/Unstructured-IO/unstructured/tree/main/unstructured/ingest/connector) and [documentation](https://unstructured-io.github.io/unstructured/ingest/index.html) ### PDF Document Parsing Example -The following examples show how to get started with the `unstructured` library. You can parse over a dozen document types with one line of code! Use this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the example below. - -The easiest way to parse a document in unstructured is to use the `partition` function. If you use `partition` function, `unstructured` will detect the file type and route it to the appropriate file-specific partitioning function. If you are using the `partition` function, you may need to install additional parameters via `pip install unstructured[local-inference]`. Ensure you first install `libmagic` using the instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection) `partition` will always apply the default arguments. If you need advanced features, use a document-specific partitioning function. +The following examples show how to get started with the `unstructured` library. The easiest way to parse a document in unstructured is to use the `partition` function. If you use `partition` function, `unstructured` will detect the file type and route it to the appropriate file-specific partitioning function. If you are using the `partition` function, you may need to install additional dependencies per doc type. +For example, to install docx dependencies you need to run `pip install "unstructured[docx]"`. +See our [installation guide](https://docs.unstructured.io/open-source/installation/full-installation) for more details. 
```python from unstructured.partition.auto import partition @@ -245,7 +225,7 @@ Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of including document image classification [11, ``` -See the [partitioning](https://unstructured-io.github.io/unstructured/core.html#partitioning) +See the [partitioning](https://docs.unstructured.io/open-source/core-functionality/partitioning) section in our documentation for a full list of options and instructions on how to use file-specific partitioning functions. @@ -263,7 +243,7 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc | Section | Description | |-|-| | [Company Website](https://unstructured.io) | Unstructured.io product and company info | -| [Documentation](https://unstructured-io.github.io/unstructured) | Full API documentation | +| [Documentation](https://docs.unstructured.io/) | Full API documentation | | [Batch Processing](unstructured/ingest/README.md) | Ingesting batches of documents through Unstructured | ## :chart_with_upwards_trend: Analytics diff --git a/setup.py b/setup.py index a51a842eaa..7adde89116 100644 --- a/setup.py +++ b/setup.py @@ -96,6 +96,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], author="Unstructured Technologies", From 1f8768750c7bc8dd149d19354ef9fc80e9fd58f9 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Fri, 31 May 2024 00:05:09 -0700 Subject: [PATCH 08/13] chore: add auth to s3 destination test (#3122) We should be validating the S3 Destination with authenticated requests, with credentials from a limited test user. ## Changes - Updates s3 destination test to point to a bucket that requires authentication. 
- Adds authentication to the s3 destination test request - Bonus: fix deserialization of S3ConnectionConfig for s3 V2 destination - Bonus: fix S3ConnectionConfig never registered for s3 V2 destination - Bonus: repair version and changelog version for consistency with -dev convention ## Testing Validated by changes to S3 destination ingest test --- .github/workflows/ci.yml | 2 ++ CHANGELOG.md | 3 ++- test_unstructured_ingest/dest/s3.sh | 16 +++++++++++----- unstructured/__version__.py | 2 +- .../ingest/v2/processes/connectors/fsspec/s3.py | 8 +++++--- 5 files changed, 21 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b1dc5bfe4f..3da1049343 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -391,6 +391,8 @@ jobs: env: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }} + S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }} AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 73725edd1f..e07f7b55b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.4-dev2 +## 0.14.4-dev3 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized. * **Clarified dependence on particular version of `python-docx`** Pinned `python-docx` version to ensure a particular method `unstructured` uses is included. * **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change. 
diff --git a/test_unstructured_ingest/dest/s3.sh b/test_unstructured_ingest/dest/s3.sh index 99b95fbee8..b992ebf7ad 100755 --- a/test_unstructured_ingest/dest/s3.sh +++ b/test_unstructured_ingest/dest/s3.sh @@ -9,17 +9,22 @@ OUTPUT_FOLDER_NAME=s3-dest OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -DESTINATION_S3="s3://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(uuidgen)/" +DESTINATION_S3="s3://utic-ingest-test-fixtures/destination/$(uuidgen)/" CI=${CI:-"false"} +if [ -z "$S3_INGEST_TEST_ACCESS_KEY" ] || [ -z "$S3_INGEST_TEST_SECRET_KEY" ]; then + echo "Skipping S3 ingest test because S3_INGEST_TEST_ACCESS_KEY or S3_INGEST_TEST_SECRET_KEY env var is not set." + exit 8 +fi + # shellcheck disable=SC1091 source "$SCRIPT_DIR"/cleanup.sh function cleanup() { cleanup_dir "$WORK_DIR" - if aws s3 ls "$DESTINATION_S3" --region us-east-2; then + if AWS_ACCESS_KEY_ID="$S3_INGEST_TEST_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$S3_INGEST_TEST_SECRET_KEY" aws s3 ls "$DESTINATION_S3" --region us-east-2; then echo "deleting destination s3 location: $DESTINATION_S3" - aws s3 rm "$DESTINATION_S3" --recursive --region us-east-2 + AWS_ACCESS_KEY_ID="$S3_INGEST_TEST_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$S3_INGEST_TEST_SECRET_KEY" aws s3 rm "$DESTINATION_S3" --recursive --region us-east-2 fi } @@ -35,12 +40,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --input-path example-docs/fake-memo.pdf \ --work-dir "$WORK_DIR" \ s3 \ - --anonymous \ + --key "$S3_INGEST_TEST_ACCESS_KEY" \ + --secret "$S3_INGEST_TEST_SECRET_KEY" \ --remote-url "$DESTINATION_S3" # Simply check the number of files uploaded expected_num_files=1 -num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}" --region us-east-2 | grep -c "\.json$") +num_files_in_s3=$(AWS_ACCESS_KEY_ID="$S3_INGEST_TEST_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$S3_INGEST_TEST_SECRET_KEY" aws s3 ls "${DESTINATION_S3}" --region 
us-east-2 | grep -c "\.json$") if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files." exit 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 469f3223a0..2846481ea6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev2" # pragma: no cover +__version__ = "0.14.4-dev3" # pragma: no cover diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/s3.py b/unstructured/ingest/v2/processes/connectors/fsspec/s3.py index de2f740408..52e8421024 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/s3.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/s3.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from dataclasses import dataclass, field from datetime import datetime from pathlib import Path @@ -165,5 +163,9 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> Non add_destination_entry( destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry(uploader=S3Upload, uploader_config=S3UploaderConfig), + entry=DestinationRegistryEntry( + uploader=S3Upload, + uploader_config=S3UploaderConfig, + connection_config=S3ConnectionConfig, + ), ) From 865ef496e656645263d6498efd319ba8f1ea184a Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 31 May 2024 11:24:41 -0400 Subject: [PATCH 09/13] ci: update `pinecone` test to use serverless (#3127) ### Summary Closes #3068. Updates the Pinecone connector tests to use serverless indexes, per the documentation [here](https://docs.pinecone.io/reference/api/control-plane/create_index). Also updates the CHANGELOG to mention serverless. Turns out we already supported it with the client version bump, but it hadn't been tested yet. 
### Testing See [this CI job](https://github.com/Unstructured-IO/unstructured/actions/runs/9319836670/job/25655322433?pr=3127) that passed, running only the Pinecone test. --- CHANGELOG.md | 5 ++++- test_unstructured_ingest/dest/pinecone.sh | 16 ++++++++++------ unstructured/__version__.py | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e07f7b55b9..5c6f0aab29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ -## 0.14.4-dev3 +## 0.14.4-dev4 ### Enhancements +* **Add support for Pinecone serverless** Adds Pinecone serverless to the connector tests. Pinecone + serverless will work version versions >=0.14.2, but hadn't been tested until now. + ### Features ### Fixes diff --git a/test_unstructured_ingest/dest/pinecone.sh b/test_unstructured_ingest/dest/pinecone.sh index e9badb7375..6891063c27 100755 --- a/test_unstructured_ingest/dest/pinecone.sh +++ b/test_unstructured_ingest/dest/pinecone.sh @@ -32,7 +32,7 @@ function cleanup { -s -o /dev/null \ -w "%{http_code}" \ --request GET \ - --url "https://controller.$PINECONE_ENVIRONMENT.pinecone.io/databases/$PINECONE_INDEX" \ + --url "https://api.pinecone.io/indexes/$PINECONE_INDEX" \ --header 'accept: application/json' \ --header "Api-Key: $PINECONE_API_KEY") @@ -41,7 +41,7 @@ function cleanup { echo "" echo "deleting index $PINECONE_INDEX" curl --request DELETE \ - "https://controller.$PINECONE_ENVIRONMENT.pinecone.io/databases/$PINECONE_INDEX" \ + "https://api.pinecone.io/indexes/$PINECONE_INDEX" \ --header "Api-Key: $PINECONE_API_KEY" \ --header 'content-type: application/json' @@ -61,8 +61,8 @@ response_code=$(curl \ -s -o /dev/null \ -w "%{http_code}" \ --request POST \ - --url "https://controller.$PINECONE_ENVIRONMENT.pinecone.io/databases" \ - --header "accept: text/plain" \ + --url "https://api.pinecone.io/indexes" \ + --header "accept: application/json" \ --header "content-type: application/json" \ --header "Api-Key: $PINECONE_API_KEY" \ --data ' 
@@ -70,8 +70,12 @@ response_code=$(curl \ "name": "'"$PINECONE_INDEX"'", "dimension": 384, "metric": "cosine", - "pods": 1, - "pod_type": "p1.x1" + "spec": { + "serverless": { + "cloud": "aws", + "region": "us-east-1" + } + } } ') diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2846481ea6..d108dd3c0f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev3" # pragma: no cover +__version__ = "0.14.4-dev4" # pragma: no cover From 4a96d549062d4b3ff03c3f3be244b08cb52b046d Mon Sep 17 00:00:00 2001 From: Yuming Long <63475068+yuming-long@users.noreply.github.com> Date: Fri, 31 May 2024 13:58:36 -0400 Subject: [PATCH 10/13] chore: move logger error to debug when pdfminer extract fails (#3028) ### Summary We are seeing the logger error `Invalid dictionary construct` for hosted APIs. This PR moves that logger error to debug level; we still continue partitioning when pdfminer text extraction fails, as before (we just no longer emit the error-level log). ### Test I was able to reproduce the logger error with an internal-only file (please DM me if needed), and the error trace looks like ``` File "/Users/yumingl/develops/unstructured/unstructured/partition/pdf.py", line 709, in _process_pdfminer_pages annotation_list = get_uris(page.annots, height, coordinate_system, page_number) File "/Users/yumingl/develops/unstructured/unstructured/partition/pdf.py", line 1049, in get_uris resolved_annots = annots.resolve() ... ``` We also won't be able to repair the PDF structure in `get_uris` (not at the page level), so this exception is moved to debug level. 
--- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c6f0aab29..f6d0c87ab8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.14.4-dev4 +## 0.14.4-dev5 ### Enhancements +* **Move logger error to debug level when PDFminer fails to extract text** which includes error message for Invalid dictionary construct. * **Add support for Pinecone serverless** Adds Pinecone serverless to the connector tests. Pinecone serverless will work version versions >=0.14.2, but hadn't been tested until now. diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d108dd3c0f..ff19fe3450 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev4" # pragma: no cover +__version__ = "0.14.4-dev5" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index d38a924a51..724145b0b5 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -266,8 +266,8 @@ def partition_pdf_or_image( for el in page_elements ) except Exception as e: - logger.error(e) - logger.warning("PDF text extraction failed, skip text extraction...") + logger.debug(e) + logger.info("PDF text extraction failed, skip text extraction...") strategy = determine_pdf_or_image_strategy( strategy, From 6005abce79f3cbf79d294dcd163598c0a9a02886 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 31 May 2024 14:41:04 -0400 Subject: [PATCH 11/13] feat: configure googlevisionapi (#3126) ### Summary Includes changes from #3117. Merged into a feature branch to run the full test suite. Original PR description: The Google Vision API allows for [configuration of the API endpoint](https://cloud.google.com/vision/docs/ocr#regionalization), to select if the data should be sent to the US or the EU. 
This PR adds an environment variable (`GOOGLEVISION_API_ENDPOINT`) to configure it. --------- Co-authored-by: JIAQIA Co-authored-by: Dimitri Lozeve --- CHANGELOG.md | 2 ++ unstructured/partition/utils/config.py | 9 ++++++--- .../partition/utils/ocr_models/google_vision_ocr.py | 12 +++++++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6d0c87ab8..7982a7165f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ ### Features +- **Allow configuration of the Google Vision API endpoint** Add an environment variable to select the Google Vision API in the US or the EU. + ### Fixes * **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized. diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 270f94bfc1..4f78739a17 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -95,11 +95,14 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: """optimum text height for tesseract OCR""" return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) + @property + def GOOGLEVISION_API_ENDPOINT(self) -> str: + """API endpoint to use for Google Vision""" + return self._get_string("GOOGLEVISION_API_ENDPOINT", "") + @property def OCR_AGENT(self) -> str: - """error margin when comparing if a ocr region is within the table element when preparing - table tokens - """ + """OCR Agent to use""" return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT) @property diff --git a/unstructured/partition/utils/ocr_models/google_vision_ocr.py b/unstructured/partition/utils/ocr_models/google_vision_ocr.py index 231a10904c..e9080f1130 100644 --- a/unstructured/partition/utils/ocr_models/google_vision_ocr.py +++ b/unstructured/partition/utils/ocr_models/google_vision_ocr.py @@ -5,6 +5,8 @@ from google.cloud.vision import Image, ImageAnnotatorClient, 
Paragraph, TextAnnotation +from unstructured.logger import logger, trace_logger +from unstructured.partition.utils.config import env_config from unstructured.partition.utils.constants import Source from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent @@ -18,7 +20,14 @@ class OCRAgentGoogleVision(OCRAgent): """OCR service implementation for Google Vision API.""" def __init__(self) -> None: - self.client = ImageAnnotatorClient() + client_options = {} + api_endpoint = env_config.GOOGLEVISION_API_ENDPOINT + if api_endpoint: + logger.info(f"Using Google Vision OCR with endpoint {api_endpoint}") + client_options["api_endpoint"] = api_endpoint + else: + logger.info("Using Google Vision OCR with default endpoint") + self.client = ImageAnnotatorClient(client_options=client_options) def is_text_sorted(self) -> bool: return True @@ -34,6 +43,7 @@ def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") def get_layout_from_image( self, image: PILImage.Image, ocr_languages: str = "eng" ) -> list[TextRegion]: + trace_logger.detail("Processing entire page OCR with Google Vision API...") with BytesIO() as buffer: image.save(buffer, format="PNG") response = self.client.document_text_detection(image=Image(content=buffer.getvalue())) From 54c1e4e57fe6fc2d6eeb2fca18d67a9f5f771569 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 31 May 2024 18:00:40 -0400 Subject: [PATCH 12/13] ci: remove jira issue workflow (#3129) ### Summary Removes the workflow for creating Jira tickets. 
--- .github/workflows/create_issue.yml | 34 ------------------------------ 1 file changed, 34 deletions(-) delete mode 100644 .github/workflows/create_issue.yml diff --git a/.github/workflows/create_issue.yml b/.github/workflows/create_issue.yml deleted file mode 100644 index 15fc196f1f..0000000000 --- a/.github/workflows/create_issue.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: create_jira_issue - -on: - issues: - types: - - opened - -jobs: - create: - runs-on: ubuntu-latest - name: Create JIRA Issue - steps: - - - name: Login to Jira - uses: atlassian/gajira-login@v3 - env: - JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }} - JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }} - JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} - - - name: Create Jira issue - uses: atlassian/gajira-create@v3 - with: - project: CORE - issuetype: Task - summary: ${{ github.event.issue.title }} - description: | - Created from github issue: ${{ github.event.issue.html_url }} - ---- - ${{ github.event.issue.body }} - fields: '{ "labels": ["github-issue"] }' - - - name: Log created issue - run: echo "Issue ${{ steps.create.outputs.issue }} was created" From 1b43102762d17924cdb01f14c808b122e98c5aba Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 31 May 2024 18:07:38 -0400 Subject: [PATCH 13/13] fix: remove root handlers when they exist (#3128) ### Summary In some environments, such as Google Colab, loggers have a root handler that does not mask sensitive values. As a result, secrets such as API keys appeared in the logs. This PR removes root handlers when they exist to ensure sensitive values are handled properly. ### Testing Run the following in a Colab notebook. You should see two log outputs, one with the API key masked and one with it exposed. 
``` !pip install unstructured ``` ```python import logging import json from unstructured.ingest.interfaces import ( ChunkingConfig, EmbeddingConfig, PartitionConfig, ProcessorConfig, ReadConfig, ) partition_config = PartitionConfig( partition_by_api=True, api_key="super secret", ) from unstructured.ingest.logger import ingest_log_streaming_init ingest_log_streaming_init(logging.INFO) logger = logging.getLogger("unstructured.ingest") logger.setLevel(logging.INFO) logger.info( f"Running partition node to extract content from json files. " f"Config: {partition_config.to_json()}, " ) ``` Now replace the first cell with the following and rerun the Python code. Only the masked logging output should remain. ``` !git clone https://github.com/Unstructured-IO/unstructured.git && cd unstructured && git checkout fix/rm-log-dupes && pip install -e . ``` --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- unstructured/ingest/logger.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7982a7165f..51f9cbf5b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.4-dev5 +## 0.14.4-dev6 ### Enhancements @@ -12,6 +12,7 @@ ### Fixes +* **Remove root handlers in ingest logger**. Removes root handlers in ingest loggers to ensure secrets aren't accidentally exposed in Colab notebooks. * **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized. * **Clarified dependence on particular version of `python-docx`** Pinned `python-docx` version to ensure a particular method `unstructured` uses is included. * **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change. 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ff19fe3450..c14a58bf38 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4-dev5" # pragma: no cover +__version__ = "0.14.4-dev6" # pragma: no cover diff --git a/unstructured/ingest/logger.py b/unstructured/ingest/logger.py index 6970c0ef03..ed4e7180e5 100644 --- a/unstructured/ingest/logger.py +++ b/unstructured/ingest/logger.py @@ -94,6 +94,15 @@ def format(self, record): return redact_jsons(s) +def remove_root_handlers(logger: logging.Logger) -> None: + # NOTE(robinson) - in some environments such as Google Colab, there is a root handler + # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs. + # Removing these when they exist prevents this behavior + if logger.root.hasHandlers(): + for handler in logger.root.handlers: + logger.root.removeHandler(handler) + + def ingest_log_streaming_init(level: int) -> None: handler = logging.StreamHandler() handler.name = "ingest_log_handler" @@ -104,6 +113,7 @@ def ingest_log_streaming_init(level: int) -> None: if "ingest_log_handler" not in [h.name for h in logger.handlers]: logger.addHandler(handler) + remove_root_handlers(logger) logger.setLevel(level) @@ -116,4 +126,5 @@ def make_default_logger(level: int) -> logging.Logger: handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(level) + remove_root_handlers(logger) return logger