diff --git a/.github/workflows/docs_generation.yml b/.github/workflows/docs_generation.yml
new file mode 100644
index 0000000..f86447c
--- /dev/null
+++ b/.github/workflows/docs_generation.yml
@@ -0,0 +1,68 @@
+name: Generate Docs
+
+# Regenerate the Markdown docs whenever a notebook changes on main.
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'notebooks/**'
+
+jobs:
+  convert-and-publish:
+    runs-on: ubuntu-latest
+    # This job pushes the generated docs back to the repository, so the
+    # GITHUB_TOKEN needs write access to repository contents.
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Convert notebooks to markdown
+        run: |
+          # nullglob: run zero iterations instead of handing nbconvert the
+          # literal pattern when no notebook matches.
+          shopt -s nullglob
+          for notebook in notebooks/*.ipynb; do
+            jupyter nbconvert --to markdown "$notebook" --output-dir docs/
+          done
+
+      - name: Update index page
+        run: |
+          # Remove the previously generated section (heading through end of
+          # file) so each run rebuilds the list instead of appending a
+          # duplicate heading and duplicate links.
+          sed -i '/^## Available Documentation/,$d' docs/index.md
+          {
+            echo "## Available Documentation:"
+            echo ""
+            for file in docs/*.md; do
+              name="$(basename "$file")"
+              if [ "$name" != "index.md" ]; then
+                echo "- [${name%.md}]($name)"
+              fi
+            done
+          } >> docs/index.md
+
+      - name: Commit and push changes
+        run: |
+          git config --local user.email "action@github.com"
+          git config --local user.name "GitHub Action"
+          git add docs
+          # Commit and push only when the staged docs actually changed.
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+          else
+            git commit -m "Update documentation"
+            git push
+          fi
diff --git a/docs/_config.yml b/docs/_config.yml
new file mode 100644
index 0000000..94ce926
--- /dev/null
+++ b/docs/_config.yml
@@ -0,0 +1,3 @@
+title: notebook_docs
+description: a repo to demonstrate how to create GitHub pages documentation from Jupyter notebooks
+theme: jekyll-theme-cayman
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..429c74a
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,12 @@
+---
+layout: default
+title: Home
+---
+
+## Welcome to Notebook Documentation
+
+This site contains documentation for our jupyter notebooks.
+
+## Available Documentation
+
+(This list will be automatically populated)
diff --git a/notebooks/testnb.ipynb b/notebooks/testnb.ipynb
new file mode 100644
index 0000000..3360688
--- /dev/null
+++ b/notebooks/testnb.ipynb
@@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import cleaning process\n",
+    "\n",
+    "The following notebook is part of our import cleaning process.\n",
+    "This notebook accomplishes the following:\n",
+    "- Imports a CSV file\n",
+    "- Removes extra columns\n",
+    "- Converts strings to correct data types\n",
+    "- Saves in the cleansed directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pip install\n",
+    "%pip install pandas\n",
+    "\n",
+    "# Here we would import libraries\n",
+    "import sys\n",
+    "import pandas as pd\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import file from raw data folder\n",
+    "Here we import the file from the raw folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pseudo code for opening a file, importing a CSV, and loading it into pandas\n",
+    "\n",
+    "# Define the file path\n",
+    "file_path = 'path/to/your/csvfile.csv'\n",
+    "\n",
+    "# Use pandas to read the CSV file\n",
+    "df = pd.read_csv(file_path)\n",
+    "\n",
+    "# Display the first few rows of the dataframe\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Removes extra columns\n",
+    "Removing the address and phone fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# BEGIN: Remove extra columns\n",
+    "\n",
+    "# List of columns to remove\n",
+    "columns_to_remove = ['address', 'phone']\n",
+    "\n",
+    "# Remove the specified columns\n",
+    "df_cleaned = df.drop(columns=columns_to_remove)\n",
+    "\n",
+    "# Display the first few rows of the cleaned dataframe\n",
+    "print(df_cleaned.head())\n",
+    "\n",
+    "# END: Remove extra columns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set data types\n",
+    "Sets the correct datatypes for date and identity fields."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pseudo code for setting data types\n",
+    "\n",
+    "# Convert the 'date_field' to datetime\n",
+    "df_cleaned['date_field'] = pd.to_datetime(df_cleaned['date_field'])\n",
+    "\n",
+    "# Convert the 'identity_field' to numeric (integer)\n",
+    "df_cleaned['identity_field'] = pd.to_numeric(df_cleaned['identity_field'], errors='coerce')\n",
+    "\n",
+    "# Display the data types of the dataframe to verify changes\n",
+    "print(df_cleaned.dtypes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Save new data to a cleansed directory\n",
+    "Write the cleansed data from Pandas to a new CSV file in the cleansed folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pseudo code to write the cleansed data to a new CSV file in the cleansed folder\n",
+    "\n",
+    "# Define the output file path\n",
+    "output_file_path = 'path/to/cleansed/folder/cleansed_data.csv'\n",
+    "\n",
+    "# Use pandas to write the dataframe to a CSV file\n",
+    "df_cleaned.to_csv(output_file_path, index=False)\n",
+    "\n",
+    "# Confirm the file has been written\n",
+    "print(f\"Cleansed data has been written to {output_file_path}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..268afb6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+nbconvert
+jupyter
\ No newline at end of file