From 09fce4bddbb8006687d84151fb4ea0e1b7afb536 Mon Sep 17 00:00:00 2001
From: jose-jaen
Date: Fri, 1 Nov 2024 21:47:20 +0100
Subject: [PATCH 1/3] feat: integrate polars into quipus

Temporarily replace pandas with polars as the DataFrame backend. This is
a first step towards relaxing the assumption that all data fits into
memory (polars also exposes a lazy, streaming API) and should improve
performance on large datasets.

Note that filter_data() now takes a polars SQL query, e.g.
"SELECT * FROM self WHERE col1 > 2", instead of a pandas query string.

Tests were updated accordingly, as they depended heavily on pandas. New
dependencies were added to pyproject.toml because polars needs extra
packages for Excel support: fastexcel for reading and XlsxWriter for
writing. The pandas-related packages and openpyxl were not removed, as
this project will keep supporting both for the foreseeable future.
---
 poetry.lock                                  | 172 ++++++++++++++++++-
 pyproject.toml                               |   3 +
 quipus/data_sources/csv_data_source.py       |  59 ++++---
 quipus/data_sources/dataframe_data_source.py |  32 ++--
 quipus/data_sources/xlsx_data_source.py      |  50 +++---
 quipus/models/certificate_factory.py         |  20 ++-
 quipus/services/template_manager.py          |   6 +-
 tests/test_certificate_factory.py            |  22 ++-
 tests/test_csv_source.py                     |  47 +++--
 tests/test_dataframe_source.py               |  15 +-
 tests/test_xlsx_source.py                    |  43 ++---
 11 files changed, 329 insertions(+), 140 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 16f24b3..1dcbc07 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -150,6 +150,10 @@ files = [
     {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a37b8f0391212d29b3a91a799c8e4a2855e0576911cdfb2515487e30e322253d"},
     {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e84799f09591700a4154154cab9787452925578841a94321d5ee8fb9a9a328f0"},
     {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f66b5337fa213f1da0d9000bc8dc0cb5b896b726eefd9c6046f699b169c41b9e"},
+    {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5dab0844f2cf82be357a0eb11a9087f70c5430b2c241493fc122bb6f2bb0917c"},
+    {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e4fe605b917c70283db7dfe5ada75e04561479075761a0b3866c081d035b01c1"},
+    {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1e9a65b5736232e7a7f91ff3d02277f11d339bf34099a56cdab6a8b3410a02b2"},
+    {file = "Brotli-1.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58d4b711689366d4a03ac7957ab8c28890415e267f9b6589969e74b6e42225ec"},
     {file = "Brotli-1.1.0-cp310-cp310-win32.whl", hash = "sha256:be36e3d172dc816333f33520154d708a2657ea63762ec16b62ece02ab5e4daf2"},
     {file = "Brotli-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:0c6244521dda65ea562d5a69b9a26120769b7a9fb3db2fe9545935ed6735b128"},
     {file = "Brotli-1.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a3daabb76a78f829cafc365531c972016e4aa8d5b4bf60660ad8ecee19df7ccc"},
@@ -162,8 +166,14 @@ files = [
     {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:19c116e796420b0cee3da1ccec3b764ed2952ccfcc298b55a10e5610ad7885f9"},
     {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:510b5b1bfbe20e1a7b3baf5fed9e9451873559a976c1a78eebaa3b86c57b4265"},
     {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a1fd8a29719ccce974d523580987b7f8229aeace506952fa9ce1d53a033873c8"},
+    {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c247dd99d39e0338a604f8c2b3bc7061d5c2e9e2ac7ba9cc1be5a69cb6cd832f"},
+    {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1b2c248cd517c222d89e74669a4adfa5577e06ab68771a529060cf5a156e9757"},
+    {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = 
"sha256:2a24c50840d89ded6c9a8fdc7b6ed3692ed4e86f1c4a4a938e1e92def92933e0"}, + {file = "Brotli-1.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f31859074d57b4639318523d6ffdca586ace54271a73ad23ad021acd807eb14b"}, {file = "Brotli-1.1.0-cp311-cp311-win32.whl", hash = "sha256:39da8adedf6942d76dc3e46653e52df937a3c4d6d18fdc94a7c29d263b1f5b50"}, {file = "Brotli-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:aac0411d20e345dc0920bdec5548e438e999ff68d77564d5e9463a7ca9d3e7b1"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:32d95b80260d79926f5fab3c41701dbb818fde1c9da590e77e571eefd14abe28"}, + {file = "Brotli-1.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b760c65308ff1e462f65d69c12e4ae085cff3b332d894637f6273a12a482d09f"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:316cc9b17edf613ac76b1f1f305d2a748f1b976b033b049a6ecdfd5612c70409"}, {file = "Brotli-1.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:caf9ee9a5775f3111642d33b86237b05808dafcd6268faa492250e9b78046eb2"}, {file = "Brotli-1.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70051525001750221daa10907c77830bc889cb6d865cc0b813d9db7fefc21451"}, @@ -174,8 +184,24 @@ files = [ {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:4093c631e96fdd49e0377a9c167bfd75b6d0bad2ace734c6eb20b348bc3ea180"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7e4c4629ddad63006efa0ef968c8e4751c5868ff0b1c5c40f76524e894c50248"}, {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:861bf317735688269936f755fa136a99d1ed526883859f86e41a5d43c61d8966"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:87a3044c3a35055527ac75e419dfa9f4f3667a1e887ee80360589eb8c90aabb9"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c5529b34c1c9d937168297f2c1fde7ebe9ebdd5e121297ff9c043bdb2ae3d6fb"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ca63e1890ede90b2e4454f9a65135a4d387a4585ff8282bb72964fab893f2111"}, + {file = "Brotli-1.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e79e6520141d792237c70bcd7a3b122d00f2613769ae0cb61c52e89fd3443839"}, {file = "Brotli-1.1.0-cp312-cp312-win32.whl", hash = "sha256:5f4d5ea15c9382135076d2fb28dde923352fe02951e66935a9efaac8f10e81b0"}, {file = "Brotli-1.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:906bc3a79de8c4ae5b86d3d75a8b77e44404b0f4261714306e3ad248d8ab0951"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8bf32b98b75c13ec7cf774164172683d6e7891088f6316e54425fde1efc276d5"}, + {file = "Brotli-1.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7bc37c4d6b87fb1017ea28c9508b36bbcb0c3d18b4260fcdf08b200c74a6aee8"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c0ef38c7a7014ffac184db9e04debe495d317cc9c6fb10071f7fefd93100a4f"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91d7cc2a76b5567591d12c01f019dd7afce6ba8cba6571187e21e2fc418ae648"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a93dde851926f4f2678e704fadeb39e16c35d8baebd5252c9fd94ce8ce68c4a0"}, + {file = "Brotli-1.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0db75f47be8b8abc8d9e31bc7aad0547ca26f24a54e6fd10231d623f183d089"}, + {file = 
"Brotli-1.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6967ced6730aed543b8673008b5a391c3b1076d834ca438bbd70635c73775368"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7eedaa5d036d9336c95915035fb57422054014ebdeb6f3b42eac809928e40d0c"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d487f5432bf35b60ed625d7e1b448e2dc855422e87469e3f450aa5552b0eb284"}, + {file = "Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7"}, + {file = "Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0"}, + {file = "Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b"}, {file = "Brotli-1.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a090ca607cbb6a34b0391776f0cb48062081f5f60ddcce5d11838e67a01928d1"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de9d02f5bda03d27ede52e8cfe7b865b066fa49258cbab568720aa5be80a47d"}, {file = "Brotli-1.1.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2333e30a5e00fe0fe55903c8832e08ee9c3b1382aacf4db26664a16528d51b4b"}, @@ -185,6 +211,10 @@ files = [ {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:069a121ac97412d1fe506da790b3e69f52254b9df4eb665cd42460c837193354"}, {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e93dfc1a1165e385cc8239fab7c036fb2cd8093728cbd85097b284d7b99249a2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:aea440a510e14e818e67bfc4027880e2fb500c2ccb20ab21c7a7c8b5b4703d75"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_i686.whl", hash = "sha256:6974f52a02321b36847cd19d1b8e381bf39939c21efd6ee2fc13a28b0d99348c"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:a7e53012d2853a07a4a79c00643832161a910674a893d296c9f1259859a289d2"}, + {file = "Brotli-1.1.0-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:d7702622a8b40c49bffb46e1e3ba2e81268d5c04a34f460978c6b5517a34dd52"}, {file = "Brotli-1.1.0-cp36-cp36m-win32.whl", hash = "sha256:a599669fd7c47233438a56936988a2478685e74854088ef5293802123b5b2460"}, {file = "Brotli-1.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d143fd47fad1db3d7c27a1b1d66162e855b5d50a89666af46e1679c496e8e579"}, {file = "Brotli-1.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:11d00ed0a83fa22d29bc6b64ef636c4552ebafcef57154b4ddd132f5638fbd1c"}, @@ -196,6 +226,10 @@ files = [ {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:919e32f147ae93a09fe064d77d5ebf4e35502a8df75c29fb05788528e330fe74"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:23032ae55523cc7bccb4f6a0bf368cd25ad9bcdcc1990b64a647e7bbcce9cb5b"}, {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:224e57f6eac61cc449f498cc5f0e1725ba2071a3d4f48d5d9dffba42db196438"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:cb1dac1770878ade83f2ccdf7d25e494f05c9165f5246b46a621cc849341dc01"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:3ee8a80d67a4334482d9712b8e83ca6b1d9bc7e351931252ebef5d8f7335a547"}, + {file = 
"Brotli-1.1.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:5e55da2c8724191e5b557f8e18943b1b4839b8efc3ef60d65985bcf6f587dd38"}, + {file = "Brotli-1.1.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:d342778ef319e1026af243ed0a07c97acf3bad33b9f29e7ae6a1f68fd083e90c"}, {file = "Brotli-1.1.0-cp37-cp37m-win32.whl", hash = "sha256:587ca6d3cef6e4e868102672d3bd9dc9698c309ba56d41c2b9c85bbb903cdb95"}, {file = "Brotli-1.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2954c1c23f81c2eaf0b0717d9380bd348578a94161a65b3a2afc62c86467dd68"}, {file = "Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3"}, @@ -208,6 +242,10 @@ files = [ {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088"}, {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943"}, + {file = "Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a"}, {file = "Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b"}, {file = "Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0"}, {file = "Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a"}, @@ -220,6 +258,10 @@ files = [ {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:949f3b7c29912693cee0afcf09acd6ebc04c57af949d9bf77d6101ebb61e388c"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:89f4988c7203739d48c6f806f1e87a1d96e0806d44f0fba61dba81392c9e474d"}, {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:de6551e370ef19f8de1807d0a9aa2cdfdce2e85ce88b122fe9f6b2b076837e59"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0737ddb3068957cf1b054899b0883830bb1fec522ec76b1098f9b6e0f02d9419"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4f3607b129417e111e30637af1b56f24f7a49e64763253bbc275c75fa887d4b2"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:6c6e0c425f22c1c719c42670d561ad682f7bfeeef918edea971a79ac5252437f"}, + {file = "Brotli-1.1.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:494994f807ba0b92092a163a0a283961369a65f6cbe01e8891132b7a320e61eb"}, {file = "Brotli-1.1.0-cp39-cp39-win32.whl", hash = "sha256:f0d8a7a6b5983c2496e364b969f0e526647a06b075d034f3297dc66f3b360c64"}, {file = "Brotli-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdad5b9014d83ca68c25d2e9444e28e967ef16e80f6b436918c700c117a85467"}, {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"}, @@ -462,6 +504,28 @@ files = [ 
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "fastexcel" +version = "0.12.0" +description = "A fast excel file reader for Python, written in Rust" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fastexcel-0.12.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d40b2c8ccb122e15cf89c2b972a679a937eca3e90b3e69c6db24f3666b11cff9"}, + {file = "fastexcel-0.12.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:2aade78706bc3f7a5861083267a038a49e809f3ee1abe6cceda7b8420092e61e"}, + {file = "fastexcel-0.12.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:993b905e61b98eb45a33409ac78b8a14b28bd3a3bcf9a4f36c1dae3e65c3dafb"}, + {file = "fastexcel-0.12.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de72203c67702010931bc4730ddb35904841b0ad0d8b6654c69b62c3d7b19eca"}, + {file = "fastexcel-0.12.0-cp38-abi3-win_amd64.whl", hash = "sha256:e5326fae6c28e2239dfdc19bc2cbb121b509e6f0aefa4e6e43b0cf84bd33dea6"}, + {file = "fastexcel-0.12.0.tar.gz", hash = "sha256:1624e2c6385fe08d5ac21392c3a5bd91156fbeebaf6986e6e7f684adc0e0ecbe"}, +] + +[package.dependencies] +pyarrow = ">=8.0.0" + +[package.extras] +pandas = ["pandas (>=1.4.4)"] +polars = ["polars (>=0.16.14)"] + [[package]] name = "fonttools" version = "4.54.1" @@ -946,6 +1010,47 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "polars" +version = "1.12.0" +description = "Blazingly fast DataFrame library" +optional = false +python-versions = ">=3.9" +files = [ + {file = "polars-1.12.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f3c4e4e423c373dda07b4c8a7ff12aa02094b524767d0ca306b1eba67f2d99e"}, + {file = "polars-1.12.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:aa6f9862f0cec6353243920d9b8d858c21ec8f25f91af203dea6ff91980e140d"}, + {file = "polars-1.12.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afb03647b5160737d2119532ee8ffe825de1d19d87f81bbbb005131786f7d59b"}, + {file = "polars-1.12.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:ea96aba5eb3dab8f0e6abf05ab3fc2136b329261860ef8661d20f5456a2d78e0"}, + {file = "polars-1.12.0-cp39-abi3-win_amd64.whl", hash = "sha256:a228a4b320a36d03a9ec9dfe7241b6d80a2f119b2dceb1da953166655e4cf43c"}, + {file = "polars-1.12.0.tar.gz", hash = "sha256:fb5c92de1a8f7d0a3f923fe48ea89eb518bdf55315ae917012350fa072bd64f4"}, +] + +[package.extras] +adbc = ["adbc-driver-manager[dbapi]", "adbc-driver-sqlite[dbapi]"] +all = ["polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone]"] +async = ["gevent"] +calamine = ["fastexcel (>=0.9)"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx (>=0.3.2)"] +database = ["nest-asyncio", "polars[adbc,connectorx,sqlalchemy]"] +deltalake = ["deltalake (>=0.15.0)"] +excel = ["polars[calamine,openpyxl,xlsx2csv,xlsxwriter]"] +fsspec = ["fsspec"] +gpu = ["cudf-polars-cu12"] +graph = ["matplotlib"] +iceberg = ["pyiceberg (>=0.5.0)"] +numpy = ["numpy (>=1.16.0)"] +openpyxl = ["openpyxl (>=3.0.0)"] +pandas = ["pandas", "polars[pyarrow]"] +plot = ["altair (>=5.4.0)"] +pyarrow = ["pyarrow (>=7.0.0)"] +pydantic = ["pydantic"] +sqlalchemy = ["polars[pandas]", "sqlalchemy"] +style = ["great-tables (>=0.8.0)"] +timezone = ["backports-zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "psycopg" version = "3.2.3" @@ -983,6 
+1088,60 @@ files = [ [package.dependencies] typing-extensions = ">=4.6" +[[package]] +name = "pyarrow" +version = "18.0.0" +description = "Python library for Apache Arrow" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2333f93260674e185cfbf208d2da3007132572e56871f451ba1a556b45dae6e2"}, + {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4c381857754da44326f3a49b8b199f7f87a51c2faacd5114352fc78de30d3aba"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:603cd8ad4976568954598ef0a6d4ed3dfb78aff3d57fa8d6271f470f0ce7d34f"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58a62549a3e0bc9e03df32f350e10e1efb94ec6cf63e3920c3385b26663948ce"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bc97316840a349485fbb137eb8d0f4d7057e1b2c1272b1a20eebbbe1848f5122"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:2e549a748fa8b8715e734919923f69318c953e077e9c02140ada13e59d043310"}, + {file = "pyarrow-18.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:606e9a3dcb0f52307c5040698ea962685fb1c852d72379ee9412be7de9c5f9e2"}, + {file = "pyarrow-18.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d5795e37c0a33baa618c5e054cd61f586cf76850a251e2b21355e4085def6280"}, + {file = "pyarrow-18.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:5f0510608ccd6e7f02ca8596962afb8c6cc84c453e7be0da4d85f5f4f7b0328a"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:616ea2826c03c16e87f517c46296621a7c51e30400f6d0a61be645f203aa2b93"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1824f5b029ddd289919f354bc285992cb4e32da518758c136271cf66046ef22"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd1b52d0d58dd8f685ced9971eb49f697d753aa7912f0a8f50833c7a7426319"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:320ae9bd45ad7ecc12ec858b3e8e462578de060832b98fc4d671dee9f10d9954"}, + {file = "pyarrow-18.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:2c992716cffb1088414f2b478f7af0175fd0a76fea80841b1706baa8fb0ebaad"}, + {file = "pyarrow-18.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:e7ab04f272f98ebffd2a0661e4e126036f6936391ba2889ed2d44c5006237802"}, + {file = "pyarrow-18.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:03f40b65a43be159d2f97fd64dc998f769d0995a50c00f07aab58b0b3da87e1f"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be08af84808dff63a76860847c48ec0416928a7b3a17c2f49a072cac7c45efbd"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c70c1965cde991b711a98448ccda3486f2a336457cf4ec4dca257a926e149c9"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:00178509f379415a3fcf855af020e3340254f990a8534294ec3cf674d6e255fd"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a71ab0589a63a3e987beb2bc172e05f000a5c5be2636b4b263c44034e215b5d7"}, + {file = "pyarrow-18.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:fe92efcdbfa0bcf2fa602e466d7f2905500f33f09eb90bf0bcf2e6ca41b574c8"}, + {file = "pyarrow-18.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = 
"sha256:907ee0aa8ca576f5e0cdc20b5aeb2ad4d3953a3b4769fc4b499e00ef0266f02f"}, + {file = "pyarrow-18.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:66dcc216ebae2eb4c37b223feaf82f15b69d502821dde2da138ec5a3716e7463"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc1daf7c425f58527900876354390ee41b0ae962a73ad0959b9d829def583bb1"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:871b292d4b696b09120ed5bde894f79ee2a5f109cb84470546471df264cae136"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:082ba62bdcb939824ba1ce10b8acef5ab621da1f4c4805e07bfd153617ac19d4"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:2c664ab88b9766413197733c1720d3dcd4190e8fa3bbdc3710384630a0a7207b"}, + {file = "pyarrow-18.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc892be34dbd058e8d189b47db1e33a227d965ea8805a235c8a7286f7fd17d3a"}, + {file = "pyarrow-18.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:28f9c39a56d2c78bf6b87dcc699d520ab850919d4a8c7418cd20eda49874a2ea"}, + {file = "pyarrow-18.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:f1a198a50c409ab2d009fbf20956ace84567d67f2c5701511d4dd561fae6f32e"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5bd7fd32e3ace012d43925ea4fc8bd1b02cc6cc1e9813b518302950e89b5a22"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420"}, + {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8"}, + {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03"}, + {file = "pyarrow-18.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2"}, + {file = "pyarrow-18.0.0.tar.gz", hash = "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5"}, +] + +[package.extras] +test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] + [[package]] name = "pycparser" version = "2.22" @@ -1251,6 +1410,17 @@ files = [ {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, ] +[[package]] +name = "xlsxwriter" +version = "3.2.0" +description = "A Python module for creating Excel XLSX files." 
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"},
+    {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"},
+]
+
 [[package]]
 name = "zopfli"
 version = "0.2.3"
@@ -1327,4 +1497,4 @@ test = ["pytest"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "24c0c9c9e3462e85c0c4602cd63e9c1e64a4fb7a14bdfe78c03431b8ba5f1a2a"
+content-hash = "56bd7950c5ab8346d2a66f88c21e6f84ffce9e5eea33423f65f5c6506b49ed34"
diff --git a/pyproject.toml b/pyproject.toml
index c4dd27f..fa9688a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,9 +16,12 @@ weasyprint = "62.3"
 paramiko = "3.4.1,<3.5.0"
 boto3 = "^1.35.34"
 pandas = "^2.2.3"
+polars = "^1.12.0"
 psycopg = "^3.2.3"
 psycopg-pool = "^3.2.3"
 openpyxl = "^3.1.5"
+XlsxWriter = "^3.2.0"
+fastexcel = "^0.12.0"

 [tool.poetry.group.dev.dependencies]
 pylint = "^3.3.1"
diff --git a/quipus/data_sources/csv_data_source.py b/quipus/data_sources/csv_data_source.py
index 70bdd9a..3f18959 100644
--- a/quipus/data_sources/csv_data_source.py
+++ b/quipus/data_sources/csv_data_source.py
@@ -1,6 +1,7 @@
-from typing import Optional, List
+from pathlib import Path
+from typing import Union, Optional, List

-import pandas as pd
+import polars as pl


 class CSVDataSource:
@@ -8,53 +9,63 @@ class CSVDataSource:
     CSV DataSource class to manage data retrieval from CSV files.

     Attributes:
-        file_path (str): Path to the CSV file.
+        file_path (Union[Path, str]): Path to the CSV file.
         delimiter (str): Delimiter used in the CSV file.
         encoding (str): Encoding of the CSV file.
-        dataframe (Optional[pd.DataFrame]): Loaded data as a pandas DataFrame.
+        dataframe (Optional[pl.DataFrame]): Loaded data as a polars DataFrame.
     """

-    def __init__(self, file_path: str, delimiter: str = ",", encoding: str = "utf-8"):
+    def __init__(
+        self,
+        file_path: Union[Path, str],
+        delimiter: str = ",",
+        encoding: str = "utf8"
+    ):
         self.file_path = file_path
         self.delimiter = delimiter
         self.encoding = encoding
-        self.dataframe: Optional[pd.DataFrame] = None
+        self.dataframe: Optional[pl.DataFrame] = None
         self.__load_data()

     def __load_data(self) -> None:
         """
-        Load data from the CSV file into a pandas DataFrame.
+        Load data from the CSV file into a polars DataFrame.
         """
-        self.dataframe = pd.read_csv(
-            self.file_path, delimiter=self.delimiter, encoding=self.encoding
+        self.dataframe = pl.read_csv(
+            source=self.file_path,
+            separator=self.delimiter,
+            encoding=self.encoding
         )

     @property
-    def file_path(self) -> str:
+    def file_path(self) -> Union[Path, str]:
         """
         Get the path to the CSV file.

         Returns:
-            str: Path to the CSV file.
+            Union[Path, str]: Path to the CSV file.
         """
         return self.__file_path

     @file_path.setter
-    def file_path(self, file_path: str) -> None:
+    def file_path(self, file_path: Union[Path, str]) -> None:
         """
         Set the path to the CSV file.

         Args:
-            file_path (str): Path to the CSV file.
+            file_path (Union[Path, str]): Path to the CSV file.

         Raises:
-            TypeError: If 'file_path' is not a string.
-            ValueError: If 'file_path' is an empty string.
+            TypeError: If 'file_path' is not a string or 'Path' object.
+            FileNotFoundError: If 'file_path' does not exist or points to a directory.
""" - if not isinstance(file_path, str): - raise TypeError("'file_path' must be a string.") - if not file_path.strip(): - raise ValueError("'file_path' cannot be an empty string.") + if not isinstance(file_path, (Path, str)): + raise TypeError("'file_path' must be either a string or 'Path' object.") + + # Ensure that path exists + file_path = Path(file_path) if isinstance(file_path, str) else file_path + if not file_path.exists() or file_path.is_dir(): + raise FileNotFoundError(f"'{file_path}' does not exist.") self.__file_path = file_path @property @@ -98,12 +109,12 @@ def encoding(self, encoding: str) -> None: raise TypeError("'encoding' must be a string.") self.__encoding = encoding - def fetch_data(self) -> pd.DataFrame: + def fetch_data(self) -> pl.DataFrame: """ - Fetch all data from the CSV file as a pandas DataFrame. + Fetch all data from the CSV file as a polars DataFrame. Returns: - pd.DataFrame: Data loaded from the CSV file. + pl.DataFrame: Data loaded from the CSV file. """ if self.dataframe is None: raise RuntimeError("No data loaded from the CSV file.") @@ -120,15 +131,15 @@ def get_columns(self) -> List[str]: raise RuntimeError("No data loaded from the CSV file.") return list(self.dataframe.columns) - def filter_data(self, query: str) -> pd.DataFrame: + def filter_data(self, query: str) -> pl.DataFrame: """ - Filter the CSV data using a pandas query string. + Filter the CSV data using a polars query string. Args: query (str): Query string to filter the data. Returns: - pd.DataFrame: Filtered data based on the query. + pl.DataFrame: Filtered data based on the query. Raises: RuntimeError: If no data is loaded. @@ -138,7 +149,7 @@ def filter_data(self, query: str) -> pd.DataFrame: raise RuntimeError("No data loaded from the CSV file.") try: - return self.dataframe.query(query) + return self.dataframe.sql(query=query) except Exception as e: raise ValueError(f"Invalid query: {query}") from e diff --git a/quipus/data_sources/dataframe_data_source.py b/quipus/data_sources/dataframe_data_source.py index 1b59670..c5bbcb3 100644 --- a/quipus/data_sources/dataframe_data_source.py +++ b/quipus/data_sources/dataframe_data_source.py @@ -1,50 +1,50 @@ from typing import List -import pandas as pd +import polars as pl class DataFrameDataSource: """ - Pandas DataFrame DataSource to manage data retrieval from DataFrames. + polars DataFrame DataSource to manage data retrieval from DataFrames. Attributes: - dataframe (pd.DataFrame): DataFrame containing the data. + dataframe (pl.DataFrame): DataFrame containing the data. """ - def __init__(self, dataframe: pd.DataFrame): + def __init__(self, dataframe: pl.DataFrame): self.dataframe = dataframe @property - def dataframe(self) -> pd.DataFrame: + def dataframe(self) -> pl.DataFrame: """ Get the DataFrame containing the data. Returns: - pd.DataFrame: DataFrame containing the data. + pl.DataFrame: DataFrame containing the data. """ return self.__dataframe @dataframe.setter - def dataframe(self, dataframe: pd.DataFrame) -> None: + def dataframe(self, dataframe: pl.DataFrame) -> None: """ Set the DataFrame containing the data. Args: - dataframe (pd.DataFrame): DataFrame containing the data. + dataframe (pl.DataFrame): DataFrame containing the data. Raises: - TypeError: If 'dataframe' is not a pandas DataFrame. + TypeError: If 'dataframe' is not a polars DataFrame. 
""" - if not isinstance(dataframe, pd.DataFrame): - raise TypeError("'dataframe' must be a pandas DataFrame.") + if not isinstance(dataframe, pl.DataFrame): + raise TypeError("'dataframe' must be a polars DataFrame.") self.__dataframe = dataframe - def fetch_data(self) -> pd.DataFrame: + def fetch_data(self) -> pl.DataFrame: """ Fetch data from the DataFrame. Returns: - pd.DataFrame: DataFrame containing the data. + pl.DataFrame: DataFrame containing the data. """ if self.dataframe is None: raise RuntimeError("No data loaded in the DataFrame.") @@ -61,7 +61,7 @@ def get_columns(self) -> List[str]: raise RuntimeError("No data loaded in the DataFrame.") return list(self.dataframe.columns) - def filter_data(self, query: str) -> pd.DataFrame: + def filter_data(self, query: str) -> pl.DataFrame: """ Filter the data in the DataFrame using a query. @@ -69,7 +69,7 @@ def filter_data(self, query: str) -> pd.DataFrame: query (str): Query to filter the data. Returns: - pd.DataFrame: Filtered DataFrame. + pl.DataFrame: Filtered DataFrame. Raises: RuntimeError: If no data is loaded in the DataFrame. @@ -86,7 +86,7 @@ def filter_data(self, query: str) -> pd.DataFrame: if query.strip() == "": raise ValueError("Query cannot be an empty string.") - return self.dataframe.query(query) + return self.dataframe.sql(query) def __str__(self) -> str: """ diff --git a/quipus/data_sources/xlsx_data_source.py b/quipus/data_sources/xlsx_data_source.py index 64fb0dc..33c1cfb 100644 --- a/quipus/data_sources/xlsx_data_source.py +++ b/quipus/data_sources/xlsx_data_source.py @@ -1,6 +1,7 @@ -from typing import Optional, List +from pathlib import Path +from typing import Union, Optional, List -import pandas as pd +import polars as pl class XLSXDataSource: @@ -8,35 +9,35 @@ class XLSXDataSource: XLSX DataSource class to manage data retrieval from Excel (.xlsx) files. Attributes: - file_path (str): Path to the Excel file. + file_path (Union[Path, str]): Path to the Excel file. sheet_name (str): Name of the sheet to load from the Excel file. - dataframe (Optional[pd.DataFrame]): Loaded data as a pandas DataFrame. + dataframe (Optional[pl.DataFrame]): Loaded data as a polars DataFrame. """ - def __init__(self, file_path: str, sheet_name: str): + def __init__(self, file_path: Union[Path, str], sheet_name: str): self.file_path = file_path self.sheet_name = sheet_name - self.dataframe: Optional[pd.DataFrame] = None + self.dataframe: Optional[pl.DataFrame] = None self.__load_data() def __load_data(self) -> None: """ - Load data from the Excel file into a pandas DataFrame. + Load data from the Excel file into a polars DataFrame. """ - self.dataframe = pd.read_excel(self.file_path, sheet_name=self.sheet_name) + self.dataframe = pl.read_excel(self.file_path, sheet_name=self.sheet_name) @property - def file_path(self) -> str: + def file_path(self) -> Union[Path, str]: """ Get the path to the Excel file. Returns: - str: Path to the Excel file. + Union[Path, str]: Path to the Excel file. """ return self.__file_path @file_path.setter - def file_path(self, file_path: str) -> None: + def file_path(self, file_path: Union[Path, str]) -> None: """ Set the path to the Excel file. @@ -47,11 +48,14 @@ def file_path(self, file_path: str) -> None: TypeError: If 'file_path' is not a string. ValueError: If 'file_path' is an empty string. 
""" - if not isinstance(file_path, str): - raise TypeError("'file_path' must be a string.") - if not file_path.strip(): - raise ValueError("'file_path' cannot be an empty string.") - self.__file_path = file_path + if not isinstance(file_path, (Path, str)): + raise TypeError("'file_path' must be either a string or 'Path' object.") + + # Ensure if path exists + path = Path(file_path) if isinstance(file_path, str) else file_path + if not path.exists() or path.is_dir(): + raise FileNotFoundError(f"'{file_path}' does not exist.") + self.__file_path = path @property def sheet_name(self) -> str: @@ -77,12 +81,12 @@ def sheet_name(self, sheet_name: str) -> None: raise TypeError("'sheet_name' must be a string.") self.__sheet_name = sheet_name - def fetch_data(self) -> pd.DataFrame: + def fetch_data(self) -> pl.DataFrame: """ - Fetch all data from the Excel sheet as a pandas DataFrame. + Fetch all data from the Excel sheet as a polars DataFrame. Returns: - pd.DataFrame: Data loaded from the Excel sheet. + pl.DataFrame: Data loaded from the Excel sheet. """ if self.dataframe is None: raise RuntimeError("No data loaded from the Excel file.") @@ -99,15 +103,15 @@ def get_columns(self) -> List[str]: raise RuntimeError("No data loaded from the Excel file.") return list(self.dataframe.columns) - def filter_data(self, query: str) -> pd.DataFrame: + def filter_data(self, query: str) -> pl.DataFrame: """ - Filter the Excel data using a pandas query string. + Filter the Excel data using a polars query string. Args: query (str): Query string to filter the data. Returns: - pd.DataFrame: Filtered data based on the query. + pl.DataFrame: Filtered data based on the query. Raises: RuntimeError: If no data is loaded. @@ -117,7 +121,7 @@ def filter_data(self, query: str) -> pd.DataFrame: raise RuntimeError("No data loaded from the Excel file.") try: - return self.dataframe.query(query) + return self.dataframe.sql(query) except Exception: raise ValueError("Invalid query provided.") diff --git a/quipus/models/certificate_factory.py b/quipus/models/certificate_factory.py index b25f12b..45a9b15 100644 --- a/quipus/models/certificate_factory.py +++ b/quipus/models/certificate_factory.py @@ -1,8 +1,13 @@ -import pandas as pd +from typing import TypeAlias, Union, Iterator, Tuple, Any, Dict, List + +import polars as pl from .certificate import Certificate +PolarsRow: TypeAlias = Union[Iterator[Tuple[Any, ...]], Iterator[Dict[str, Any]]] + + class CertificateFactory: """ Factory class to create Certificate objects @@ -12,12 +17,12 @@ class CertificateFactory: - create_certificates: create a list of Certificate objects from a DataFrame """ @staticmethod - def create_one_certificate(row: pd.Series) -> Certificate: + def create_one_certificate(row: PolarsRow) -> Certificate: """ Create a single Certificate object from a row in a DataFrame Args: - row (pd.Series): a row in a DataFrame containing the certificate data + row (PolarsRow): a row in a DataFrame containing the certificate data Returns: Certificate: a Certificate object created from the row @@ -32,16 +37,17 @@ def create_one_certificate(row: pd.Series) -> Certificate: ) @staticmethod - def create_certificates(df: pd.DataFrame) -> list[Certificate]: + def create_certificates(df: pl.DataFrame) -> List[Certificate]: """ Create a list of Certificate objects from a DataFrame Args: - df (pd.DataFrame): a DataFrame containing the certificate data + df (pl.DataFrame): a DataFrame containing the certificate data Returns: - list[Certificate]: a list of Certificate objects created 
from the DataFrame
+            List[Certificate]: a list of Certificate objects created from the DataFrame
         """
         return [
-            CertificateFactory.create_one_certificate(row) for _, row in df.iterrows()
+            CertificateFactory.create_one_certificate(row)
+            for row in df.iter_rows(named=True)
         ]
diff --git a/quipus/services/template_manager.py b/quipus/services/template_manager.py
index fd9da96..d499f1e 100644
--- a/quipus/services/template_manager.py
+++ b/quipus/services/template_manager.py
@@ -93,8 +93,10 @@ def from_source(self, source_type: Literal["csv"], **kwargs) -> Self:

     def from_csv(self, path_to_file: str) -> Self:
         csv_data_source = CSVDataSource(file_path=path_to_file)
-        self.data = csv_data_source.fetch_data().to_dict(orient="records")
-
+        fetched_data = csv_data_source.fetch_data()
+        self.data = []
+        for row in fetched_data.iter_rows(named=True):
+            self.data.append(row)
         return self

     def with_multiple_templates(self, templates: list[Template]):
diff --git a/tests/test_certificate_factory.py b/tests/test_certificate_factory.py
index b7add70..b758745 100644
--- a/tests/test_certificate_factory.py
+++ b/tests/test_certificate_factory.py
@@ -1,21 +1,19 @@
 import pytest
-import pandas as pd
+import polars as pl

 from quipus import CertificateFactory, Certificate


 @pytest.fixture
 def sample_row():
-    return pd.Series(
-        {
-            "completion_date": "2024-10-27",
-            "content": "Test content 1",
-            "entity": "Test entity 1",
-            "name": "Test name 1",
-            "duration": "Test duration 1",
-            "validity_checker": "https://example.com/check",
-        }
-    )
+    return {
+        "completion_date": "2024-10-27",
+        "content": "Test content 1",
+        "entity": "Test entity 1",
+        "name": "Test name 1",
+        "duration": "Test duration 1",
+        "validity_checker": "https://example.com/check",
+    }


 @pytest.fixture
@@ -31,7 +29,7 @@ def sample_dataframe():
             "https://example.com/check2",
         ],
     }
-    return pd.DataFrame(data)
+    return pl.DataFrame(data)


 def test_create_one_certificate(sample_row):
diff --git a/tests/test_csv_source.py b/tests/test_csv_source.py
index 2c6e323..ee66dd3 100644
--- a/tests/test_csv_source.py
+++ b/tests/test_csv_source.py
@@ -1,4 +1,7 @@
+from pathlib import Path
+
 import pytest
+import polars as pl
 import pandas as pd

 from quipus import CSVDataSource
@@ -8,11 +11,11 @@ def test_csv_data_source_valid_initialization(tmp_path):
     csv_file = tmp_path / "test.csv"
     csv_file.write_text("col1,col2\n1,2\n3,4")

-    data_source = CSVDataSource(file_path=str(csv_file))
+    data_source = CSVDataSource(file_path=csv_file)

-    assert data_source.file_path == str(csv_file)
+    assert str(data_source.file_path) == str(csv_file)
     assert data_source.delimiter == ","
-    assert data_source.encoding == "utf-8"
+    assert data_source.encoding == "utf8"
     assert data_source.dataframe is not None


@@ -22,7 +25,7 @@ def test_csv_data_source_invalid_file_path_type():


 def test_csv_data_source_empty_file_path():
-    with pytest.raises(ValueError):
+    with pytest.raises(FileNotFoundError):
         CSVDataSource(file_path="")


@@ -40,8 +43,8 @@ def test_csv_data_source_fetch_data(tmp_path):

     df = data_source.fetch_data()

-    expected_df = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
-    pd.testing.assert_frame_equal(df.reset_index(drop=True), expected_df)
+    expected_df = pl.DataFrame({"col1": [1, 3], "col2": [2, 4]})
+    assert df.equals(expected_df)


 def test_csv_data_source_fetch_data_no_data(tmp_path):
@@ -86,10 +89,10 @@ def test_csv_data_source_filter_data(tmp_path):

     data_source = CSVDataSource(file_path=str(csv_file))

-    filtered_df = data_source.filter_data("col1 > 2")
+    filtered_df = 
data_source.filter_data("SELECT * FROM self WHERE col1 > 2")

-    expected_df = pd.DataFrame({"col1": [3, 5], "col2": [4, 6]}, index=[1, 2])
-    pd.testing.assert_frame_equal(filtered_df, expected_df)
+    expected_df = pl.DataFrame({"col1": [3, 5], "col2": [4, 6]})
+    assert filtered_df.equals(expected_df)


 def test_csv_data_source_filter_data_invalid_query(tmp_path):
@@ -112,7 +115,7 @@ def test_csv_data_source_filter_data_no_data(tmp_path):
     data_source.dataframe = None

     with pytest.raises(RuntimeError, match="No data loaded from the CSV file."):
-        data_source.filter_data("col1 > 2")
+        data_source.filter_data("SELECT * FROM self WHERE col1 > 2")


 def test_csv_data_source_str(tmp_path):
@@ -146,14 +149,16 @@ def test_csv_data_source_invalid_encoding(tmp_path):
         data_source.encoding = 123


-def test_csv_data_source_invalid_delimiter_init():
-    csv_file = "test.csv"
+def test_csv_data_source_invalid_delimiter_init(tmp_path):
+    csv_file = tmp_path / "test.csv"
+    csv_file.write_text("col1,col2\n1,2\n3,4")
     with pytest.raises(TypeError):
         CSVDataSource(file_path=csv_file, delimiter=123)


-def test_csv_data_source_invalid_encoding_init():
-    csv_file = "test.csv"
+def test_csv_data_source_invalid_encoding_init(tmp_path):
+    csv_file = tmp_path / "test.csv"
+    csv_file.write_text("col1,col2\n1,2\n3,4")
     with pytest.raises(TypeError):
         CSVDataSource(file_path=csv_file, encoding=123)

@@ -162,7 +167,7 @@ def test_csv_data_source_empty_csv(tmp_path):
     csv_file = tmp_path / "empty.csv"
     csv_file.write_text("")

-    with pytest.raises(pd.errors.EmptyDataError):
+    with pytest.raises(pl.exceptions.NoDataError):
         CSVDataSource(file_path=str(csv_file))


@@ -176,15 +181,5 @@ def test_csv_data_source_invalid_csv(tmp_path):

     df = data_source.fetch_data()
     assert df is not None
-    assert not df.empty
+    assert not df.is_empty()
     assert df.shape == (1, 7)
-
-
-def test_csv_data_source_read_csv_exception(monkeypatch):
-    def mock_read_csv(*args, **kwargs):
-        raise pd.errors.ParserError("Mocked parser error")
-
-    monkeypatch.setattr(pd, "read_csv", mock_read_csv)
-
-    with pytest.raises(pd.errors.ParserError, match="Mocked parser error"):
-        CSVDataSource(file_path="any.csv")
diff --git a/tests/test_dataframe_source.py b/tests/test_dataframe_source.py
index 1ebe2bd..d9d31a2 100644
--- a/tests/test_dataframe_source.py
+++ b/tests/test_dataframe_source.py
@@ -1,4 +1,5 @@
 import pytest
+import polars as pl
 import pandas as pd

 from quipus.data_sources.dataframe_data_source import DataFrameDataSource
@@ -6,7 +7,7 @@

 @pytest.fixture
 def sample_dataframe():
-    return pd.DataFrame(
+    return pl.DataFrame(
         {
             "A": [1, 2, 3, 4],
             "B": [10, 20, 30, 40],
@@ -22,12 +23,12 @@ def dataframe_source(sample_dataframe):

 @pytest.fixture
 def dataframe_source_empty():
-    return DataFrameDataSource(pd.DataFrame())
+    return DataFrameDataSource(pl.DataFrame())


 def test_fetch_data(dataframe_source, sample_dataframe):
     fetched_data = dataframe_source.fetch_data()
-    pd.testing.assert_frame_equal(fetched_data, sample_dataframe)
+    assert fetched_data.equals(sample_dataframe)


 def test_get_columns(dataframe_source):
@@ -35,8 +36,8 @@ def test_get_columns(dataframe_source):


 def test_filter_data(dataframe_source):
-    filtered_data = dataframe_source.filter_data("A > 2")
-    expected_filtered_data = pd.DataFrame(
+    filtered_data = dataframe_source.filter_data("SELECT * FROM self WHERE A > 2")
+    expected_filtered_data = pl.DataFrame(
         {
             "A": [3, 4],
             "B": [30, 40],
@@ -44,9 +45,7 @@ def test_filter_data(dataframe_source):
         }
     )

-    pd.testing.assert_frame_equal(
-        filtered_data.reset_index(drop=True), 
expected_filtered_data
-    )
+    assert filtered_data.equals(expected_filtered_data)


 def test_filter_data_invalid_query(dataframe_source):
diff --git a/tests/test_xlsx_source.py b/tests/test_xlsx_source.py
index 1b9a8b3..5aede65 100644
--- a/tests/test_xlsx_source.py
+++ b/tests/test_xlsx_source.py
@@ -1,16 +1,17 @@
 import pytest
+import polars as pl
 import pandas as pd

 from quipus import XLSXDataSource


 def test_xlsx_data_source_valid_initialization(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
-    data.to_excel(xlsx_file, index=False)
+    data = pl.DataFrame({"col1": [1, 3], "col2": [2, 4]})
+    data.write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")

-    assert data_source.file_path == str(xlsx_file)
+    assert str(data_source.file_path) == str(xlsx_file)
     assert data_source.sheet_name == "Sheet1"
     assert data_source.dataframe is not None

@@ -19,7 +20,7 @@ def test_xlsx_data_source_invalid_file_path_type():
         XLSXDataSource(file_path=123, sheet_name="Sheet1")

 def test_xlsx_data_source_empty_file_path():
-    with pytest.raises(ValueError):
+    with pytest.raises(FileNotFoundError):
         XLSXDataSource(file_path="", sheet_name="Sheet1")

 def test_xlsx_data_source_file_not_found():
@@ -28,18 +29,18 @@ def test_xlsx_data_source_file_not_found():

 def test_xlsx_data_source_fetch_data(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1, 3], "col2": [2, 4]})
-    data.to_excel(xlsx_file, index=False)
+    data = pl.DataFrame({"col1": [1, 3], "col2": [2, 4]})
+    data.write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")
     df = data_source.fetch_data()

-    pd.testing.assert_frame_equal(df.reset_index(drop=True), data)
+    assert df.equals(data)


 def test_xlsx_data_source_get_columns(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
-    data.to_excel(xlsx_file, index=False)
+    data = pl.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+    data.write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")
     columns = data_source.get_columns()
@@ -48,26 +49,26 @@ def test_xlsx_data_source_get_columns(tmp_path):

 def test_xlsx_data_source_filter_data(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1, 3, 5], "col2": [2, 4, 6]})
-    data.to_excel(xlsx_file, index=False)
+    data = pl.DataFrame({"col1": [1, 3, 5], "col2": [2, 4, 6]})
+    data.write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")
-    filtered_df = data_source.filter_data("col1 > 2")
+    filtered_df = data_source.filter_data("SELECT * FROM self WHERE col1 > 2")

-    expected_df = pd.DataFrame({"col1": [3, 5], "col2": [4, 6]}, index=[1, 2])
-    pd.testing.assert_frame_equal(filtered_df, expected_df)
+    expected_df = pl.DataFrame({"col1": [3, 5], "col2": [4, 6]})
+    assert filtered_df.equals(expected_df)


 def test_xlsx_data_source_invalid_sheet_name(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1], "col2": [2]})
-    data.to_excel(xlsx_file, index=False, sheet_name="Sheet1")
+    data = pl.DataFrame({"col1": [1], "col2": [2]})
+    data.write_excel(xlsx_file, worksheet="Sheet1")
     with pytest.raises(ValueError):
         XLSXDataSource(file_path=str(xlsx_file), sheet_name="InvalidSheet")

 def test_xlsx_data_source_no_data_loaded(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    pd.DataFrame({"col1": [1], "col2": [2]}).to_excel(xlsx_file, index=False)
+    pl.DataFrame({"col1": [1], "col2": 
[2]}).write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")
     data_source.dataframe = None
@@ -77,8 +78,8 @@ def test_xlsx_data_source_no_data_loaded(tmp_path):

 def test_xlsx_data_source_invalid_query(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
-    data.to_excel(xlsx_file, index=False)
+    data = pl.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+    data.write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")

@@ -87,8 +88,8 @@ def test_xlsx_data_source_str(tmp_path):
     xlsx_file = tmp_path / "test.xlsx"
-    data = pd.DataFrame({"col1": [1], "col2": [2]})
-    data.to_excel(xlsx_file, index=False)
+    data = pl.DataFrame({"col1": [1], "col2": [2]})
+    data.write_excel(xlsx_file)

     data_source = XLSXDataSource(file_path=str(xlsx_file), sheet_name="Sheet1")
     expected_str = f"XLSXDataSource(file_path={str(xlsx_file)}, sheet_name=Sheet1)"

From de6b82465dec5785519487498373c391da6dac21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20U=2E=20Alarc=C3=B3n?=
Date: Fri, 1 Nov 2024 22:24:35 +0100
Subject: [PATCH 2/3] Delete quipus/models/certificate_factory.py

Unused file.
---
 quipus/models/certificate_factory.py | 53 ----------------------------
 1 file changed, 53 deletions(-)
 delete mode 100644 quipus/models/certificate_factory.py

diff --git a/quipus/models/certificate_factory.py b/quipus/models/certificate_factory.py
deleted file mode 100644
index 45a9b15..0000000
--- a/quipus/models/certificate_factory.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from typing import TypeAlias, Any, Dict, List
-
-import polars as pl
-
-from .certificate import Certificate
-
-
-PolarsRow: TypeAlias = Dict[str, Any]
-
-
-class CertificateFactory:
-    """
-    Factory class to create Certificate objects
-
-    Methods:
-    - create_one_certificate: create a single Certificate object from a DataFrame row
-    - create_certificates: create a list of Certificate objects from a DataFrame
-    """
-    @staticmethod
-    def create_one_certificate(row: PolarsRow) -> Certificate:
-        """
-        Create a single Certificate object from a row in a DataFrame
-
-        Args:
-            row (PolarsRow): a row in a DataFrame containing the certificate data
-
-        Returns:
-            Certificate: a Certificate object created from the row
-        """
-        return Certificate(
-            completion_date=row["completion_date"],
-            content=row["content"],
-            entity=row["entity"],
-            name=row["name"],
-            duration=row.get("duration", None),
-            validity_checker=row.get("validity_checker", None),
-        )
-
-    @staticmethod
-    def create_certificates(df: pl.DataFrame) -> List[Certificate]:
-        """
-        Create a list of Certificate objects from a DataFrame
-
-        Args:
-            df (pl.DataFrame): a DataFrame containing the certificate data
-
-        Returns:
-            List[Certificate]: a list of Certificate objects created from the DataFrame
-        """
-        return [
-            CertificateFactory.create_one_certificate(row)
-            for row in df.iter_rows(named=True)
-        ]

From dc6672fb8552a4595571896b5fe1f3e0e1f17823 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20U=2E=20Alarc=C3=B3n?=
Date: Fri, 1 Nov 2024 22:25:12 +0100
Subject: [PATCH 3/3] Delete tests/test_certificate_factory.py

Unused file.
---
 tests/test_certificate_factory.py | 65 ------------------------------
 1 file changed, 65 deletions(-)
 delete mode 
100644 tests/test_certificate_factory.py

diff --git a/tests/test_certificate_factory.py b/tests/test_certificate_factory.py
deleted file mode 100644
index b758745..0000000
--- a/tests/test_certificate_factory.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import pytest
-import polars as pl
-
-from quipus import CertificateFactory, Certificate
-
-
-@pytest.fixture
-def sample_row():
-    return {
-        "completion_date": "2024-10-27",
-        "content": "Test content 1",
-        "entity": "Test entity 1",
-        "name": "Test name 1",
-        "duration": "Test duration 1",
-        "validity_checker": "https://example.com/check",
-    }
-
-
-@pytest.fixture
-def sample_dataframe():
-    data = {
-        "completion_date": ["2024-10-27", "2024-10-25"],
-        "content": ["Test content 1", "Test content 2"],
-        "entity": ["Test entity 1", "Test entity 2"],
-        "name": ["Test name 1", "Test name 2"],
-        "duration": ["Test duration 1", "Test duration 2"],
-        "validity_checker": [
-            "https://example.com/check1",
-            "https://example.com/check2",
-        ],
-    }
-    return pl.DataFrame(data)
-
-
-def test_create_one_certificate(sample_row):
-    certificate = CertificateFactory.create_one_certificate(sample_row)
-    assert isinstance(certificate, Certificate)
-    assert certificate.completion_date == "2024-10-27"
-    assert certificate.content == "Test content 1"
-    assert certificate.entity == "Test entity 1"
-    assert certificate.name == "Test name 1"
-    assert certificate.duration == "Test duration 1"
-    assert certificate.validity_checker == "https://example.com/check"
-
-
-def test_create_certificates(sample_dataframe):
-    certificates = CertificateFactory.create_certificates(sample_dataframe)
-
-    assert isinstance(certificates, list)
-    assert len(certificates) == len(sample_dataframe)
-    assert all(isinstance(cert, Certificate) for cert in certificates)
-
-    assert certificates[0].completion_date == "2024-10-27"
-    assert certificates[0].content == "Test content 1"
-    assert certificates[0].entity == "Test entity 1"
-    assert certificates[0].name == "Test name 1"
-    assert certificates[0].duration == "Test duration 1"
-    assert certificates[0].validity_checker == "https://example.com/check1"
-
-    assert certificates[1].completion_date == "2024-10-25"
-    assert certificates[1].content == "Test content 2"
-    assert certificates[1].entity == "Test entity 2"
-    assert certificates[1].name == "Test name 2"
-    assert certificates[1].duration == "Test duration 2"
-    assert certificates[1].validity_checker == "https://example.com/check2"
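
-- 
Note for reviewers: after this series, filter_data() speaks polars SQL
rather than pandas query syntax. A minimal sketch of the new call
pattern, assuming a hypothetical certificates.csv with a numeric col1
column (file name and column are made up for illustration):

    import polars as pl

    from quipus import CSVDataSource

    # DataFrame.sql() in polars exposes the frame under the table name "self"
    source = CSVDataSource(file_path="certificates.csv")  # hypothetical file
    filtered: pl.DataFrame = source.filter_data("SELECT * FROM self WHERE col1 > 2")
    print(filtered)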