Skip to content

Commit

Permalink
create a single parquet output
Browse files: browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Aug 30, 2024
1 parent 3e789df commit 6b84ade
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions examples/export_multimodal.py
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
import datetime
import logging
import time
from pathlib import Path
Expand All @@ -18,7 +19,7 @@ def main():
logging.basicConfig(level=logging.INFO)

input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")

Expand All @@ -37,10 +38,13 @@ def main():

converted_docs = doc_converter.convert(input_files)

success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
continue

doc_filename = doc.input.file.stem
Expand Down Expand Up @@ -73,15 +77,23 @@ def main():
},
}
)
df = pd.json_normalize(rows)
success_count += 1

output_filename = output_dir / f"{doc_filename}.parquet"
df.to_parquet(output_filename)
# Generate one parquet from all documents
df = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)

end_time = time.time() - start_time

_log.info(f"All documents were converted in {end_time:.2f} seconds.")

if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)


if __name__ == "__main__":
main()

0 comments on commit 6b84ade

Please sign in to comment.