diff --git a/runbooks/change_partitions_runbook.ipynb b/runbooks/change_partitions_runbook.ipynb new file mode 100644 index 0000000000..b8e2873646 --- /dev/null +++ b/runbooks/change_partitions_runbook.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "74eb666f-6645-482f-8cda-018df4c06d84", + "metadata": {}, + "outputs": [], + "source": [ + "from calitp.storage import get_fs\n", + "fs = get_fs()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6763d9c-bbac-47d3-9ec5-ee119905cc5e", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.notebook import tqdm\n", + "files = fs.expand_path('gs://calitp-gtfs-schedule-raw/schedule/', recursive=True)\n", + "files = [file for file in files if \"ts\" in file and fs.stat(file)[\"type\"] != \"directory\" and \"2022-07-15\" not in file]\n", + "len(files)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6bbcf9d-b2ff-4e7b-a921-8db98c3bff37", + "metadata": {}, + "outputs": [], + "source": [ + "paths = [(path, *path.split(\"/\")) for path in files]\n", + "paths[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a5cac31-d58f-48ef-862c-3ac08aba3f1f", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pendulum" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90dea8da-d5ee-4d07-ae4b-22e24a045e57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pendulum\n", + "from datetime import datetime\n", + "moves = []\n", + "for og_path, bucket, table, dt, base64url, time, filename in paths:\n", + " pdt = pendulum.parse(dt.replace(\"dt=\", \"\"), exact=True)\n", + " assert isinstance(pdt, pendulum.Date)\n", + " ptime = pendulum.parse(time.replace(\"time=\", \"\"), exact=True)\n", + " assert isinstance(ptime, pendulum.Time)\n", + " ts = pendulum.instance(datetime.combine(pdt, ptime))\n", + " assert isinstance(ts, pendulum.DateTime)\n", + " new_path = \"/\".join([bucket, table, dt, base64url, f\"ts={ts.to_iso8601_string()}\", filename])\n", + " moves.append((og_path, new_path))\n", + "moves[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "461f52bc-465f-4088-b9d9-5a29607852f4", + "metadata": {}, + "outputs": [], + "source": [ + "for og_path, new_path in tqdm(moves):\n", + " #print(og_path, new_path)\n", + " #break\n", + " fs.mv(og_path, new_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a47c511-66ec-4fb6-9c8a-ed64041fc9ff", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "notices = {\n", + " filename: json.loads(fs.cat(f\"gs://{filename}\").decode())\n", + " for filename in tqdm(files)\n", + "}\n", + "notices[\"gtfs-data/schedule/processed/2022-06-27_101_0/validation_report.json\"].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfa6251b-ec2c-4980-9575-75bbf321fa24", + "metadata": {}, + "outputs": [], + "source": [ + "d = notices[\"gtfs-data/schedule/processed/2022-06-27_101_0/validation_report.json\"]\n", + "d[\"notices\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec6c0868-b155-4b9b-9554-3030d7fb6534", + "metadata": {}, + "outputs": [], + "source": [ + "all_keys = []\n", + "\n", + "for name, file in notices.items():\n", + " for notice in file[\"notices\"]:\n", + " for subnotice in notice[\"notices\"]:\n", + " all_keys.append((name, subnotice.keys()))\n", + "all_keys[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "011c4e96-f5fc-4210-a8ad-5914cb74694c", + "metadata": {}, + "outputs": [], + "source": [ + "[stuff for stuff in all_keys if 'columnName' in stuff[1]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff7e0c7f-e7ab-4744-ba3b-6006119137b7", + "metadata": {}, + "outputs": [], + "source": [ + "rts = fs.glob('gs://gtfs-data/rt/2022-06-28T14:00:39/**')\n", + "rts = [fs.info(file) for file in rts if file.endswith('url')]\n", + "sorted(rts, key=lambda f: f['size'], reverse=True)[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03756990-89b3-4679-9880-a9d7a30a8e01", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}