Skip to content

Commit

Permalink
Merge pull request #1635 from cal-itp/move-files-runbook
Browse files Browse the repository at this point in the history
add notebook used to re-name a partition
  • Loading branch information
atvaccaro authored Aug 8, 2022
2 parents aed6d71 + 8de5c73 commit 761d0c1
Showing 1 changed file with 179 additions and 0 deletions.
179 changes: 179 additions & 0 deletions runbooks/change_partitions_runbook.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "74eb666f-6645-482f-8cda-018df4c06d84",
"metadata": {},
"outputs": [],
"source": [
"from calitp.storage import get_fs\n",
"fs = get_fs()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6763d9c-bbac-47d3-9ec5-ee119905cc5e",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.notebook import tqdm\n",
"\n",
"# Recursively list everything under the raw schedule prefix. The listing can include\n",
"# directory entries, which is why the stat() type check below is needed.\n",
"files = fs.expand_path('gs://calitp-gtfs-schedule-raw/schedule/', recursive=True)\n",
"\n",
"# NOTE(review): this keeps paths containing the substring \"ts\", while the cell that\n",
"# builds `moves` strips a `time=` prefix from the fifth path segment. Confirm which\n",
"# partition name is meant to be selected here before running the move.\n",
"files = [\n",
"    file\n",
"    for file in files\n",
"    # Cheap substring checks come first so the per-path fs.stat() lookup is skipped\n",
"    # for paths that are excluded anyway; the resulting list is unchanged.\n",
"    if \"ts\" in file\n",
"    and \"2022-07-15\" not in file\n",
"    and fs.stat(file)[\"type\"] != \"directory\"\n",
"]\n",
"len(files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6bbcf9d-b2ff-4e7b-a921-8db98c3bff37",
"metadata": {},
"outputs": [],
"source": [
"# Pair every full object path with its individual '/'-separated segments, so later\n",
"# cells can unpack (full_path, segment_0, segment_1, ...).\n",
"paths = []\n",
"for path in files:\n",
"    segments = path.split(\"/\")\n",
"    paths.append((path, *segments))\n",
"paths[:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a5cac31-d58f-48ef-862c-3ac08aba3f1f",
"metadata": {},
"outputs": [],
"source": [
"%pip install pendulum"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90dea8da-d5ee-4d07-ae4b-22e24a045e57",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pendulum\n",
"from datetime import datetime\n",
"\n",
"# Build (old_path, new_path) pairs in which the time= segment is replaced by a single\n",
"# ts=<ISO 8601 datetime> segment that combines the dt= date with the time= time.\n",
"moves = []\n",
"for record in paths:\n",
"    # Each record must be the full path followed by exactly six path segments.\n",
"    og_path, bucket, table, dt, base64url, time, filename = record\n",
"\n",
"    # exact=True asks pendulum for a Date / Time instead of a full DateTime; the\n",
"    # asserts stop the run if a segment parses as anything else.\n",
"    date_part = pendulum.parse(dt.replace(\"dt=\", \"\"), exact=True)\n",
"    assert isinstance(date_part, pendulum.Date)\n",
"    time_part = pendulum.parse(time.replace(\"time=\", \"\"), exact=True)\n",
"    assert isinstance(time_part, pendulum.Time)\n",
"\n",
"    combined = datetime.combine(date_part, time_part)\n",
"    ts = pendulum.instance(combined)\n",
"    assert isinstance(ts, pendulum.DateTime)\n",
"\n",
"    ts_segment = f\"ts={ts.to_iso8601_string()}\"\n",
"    new_path = \"/\".join([bucket, table, dt, base64url, ts_segment, filename])\n",
"    moves.append((og_path, new_path))\n",
"moves[:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "461f52bc-465f-4088-b9d9-5a29607852f4",
"metadata": {},
"outputs": [],
"source": [
"# Set DRY_RUN = True to print the first planned move and stop without touching\n",
"# storage. With the default (False) every (old, new) pair in `moves` is moved.\n",
"DRY_RUN = False\n",
"\n",
"for og_path, new_path in tqdm(moves):\n",
"    if DRY_RUN:\n",
"        print(og_path, new_path)\n",
"        break\n",
"    fs.mv(og_path, new_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a47c511-66ec-4fb6-9c8a-ed64041fc9ff",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# NOTE(review): the report key looked up at the end lives under gtfs-data/..., which\n",
"# does not match the calitp-gtfs-schedule-raw listing built earlier in this notebook;\n",
"# presumably `files` held a different listing when this cell was written - confirm.\n",
"\n",
"# Read each listed object and parse it as JSON, keyed by its path.\n",
"notices = {}\n",
"for filename in tqdm(files):\n",
"    raw_bytes = fs.cat(f\"gs://{filename}\")\n",
"    notices[filename] = json.loads(raw_bytes.decode())\n",
"\n",
"notices[\"gtfs-data/schedule/processed/2022-06-27_101_0/validation_report.json\"].keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bfa6251b-ec2c-4980-9575-75bbf321fa24",
"metadata": {},
"outputs": [],
"source": [
"# Look at the first top-level notice of one specific validation report.\n",
"report_key = \"gtfs-data/schedule/processed/2022-06-27_101_0/validation_report.json\"\n",
"d = notices[report_key]\n",
"d[\"notices\"][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec6c0868-b155-4b9b-9554-3030d7fb6534",
"metadata": {},
"outputs": [],
"source": [
"# One (report name, keys view) entry per nested notice: every report has a list of\n",
"# notices, each of which carries its own inner \"notices\" list.\n",
"all_keys = [\n",
"    (name, subnotice.keys())\n",
"    for name, file in notices.items()\n",
"    for notice in file[\"notices\"]\n",
"    for subnotice in notice[\"notices\"]\n",
"]\n",
"all_keys[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "011c4e96-f5fc-4210-a8ad-5914cb74694c",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the entries whose nested notice has a 'columnName' key.\n",
"list(filter(lambda entry: 'columnName' in entry[1], all_keys))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff7e0c7f-e7ab-4744-ba3b-6006119137b7",
"metadata": {},
"outputs": [],
"source": [
"# Show the five largest objects (by reported 'size') whose path ends in 'url' under\n",
"# one RT snapshot prefix.\n",
"rt_paths = fs.glob('gs://gtfs-data/rt/2022-06-28T14:00:39/**')\n",
"rts = [fs.info(rt_path) for rt_path in rt_paths if rt_path.endswith('url')]\n",
"largest_first = sorted(rts, key=lambda info: info['size'], reverse=True)\n",
"largest_first[:5]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03756990-89b3-4679-9880-a9d7a30a8e01",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 761d0c1

Please sign in to comment.