#!/bin/bash
# build-grok-latest.sh
#
# Downloads a Google Sheets workbook as XLSX, converts every worksheet to a
# JSON file with a generated Python helper, and writes data.json as an index
# of the generated files.

# URL for Google Sheets download as XLSX
URL="https://docs.google.com/spreadsheets/d/13U87Sm6e6Fh1SWipl_y_Hi6r2zxfYLyTwjMMZoLjMl4/export?format=xlsx"

# Temporary and output files (the workbook and log names embed the current
# epoch timestamp; the Python helper uses a fixed name)
TIMESTAMP=$(date +%s)
TEMP_XLSX="temp_$TIMESTAMP.xlsx"
PYTHON_SCRIPT="convert_xlsx_to_json.py"
DOWNLOAD_LOG="download_$TIMESTAMP.log"

# Download the XLSX file with verbose output. curl's output is shown on the
# terminal and saved to the log at the same time via tee.
echo "Attempting to download from $URL"
curl -f --location -v "$URL" -o "$TEMP_XLSX" 2>&1 | tee "$DOWNLOAD_LOG"
# $? after a pipeline is the status of its LAST command (tee), which hides a
# failed download; PIPESTATUS[0] is curl's own exit status.
curl_status=${PIPESTATUS[0]}

# Check if download was successful
if [ "$curl_status" -ne 0 ]; then
    echo "Failed to download XLSX. Check $DOWNLOAD_LOG for details."
    echo "curl exited with status $curl_status."
    exit 1
fi

# Verify the file extension (openpyxl only opens paths that carry a supported
# Excel extension, so a wrong name would fail later with a less obvious error)
if [[ "$TEMP_XLSX" != *.xlsx ]]; then
    echo "Downloaded file does not have .xlsx extension. Please check the download URL or the file."
    exit 1
fi

# Check if the file exists
if [ ! -f "$TEMP_XLSX" ]; then
    echo "File $TEMP_XLSX does not exist after download attempt."
    exit 1
fi

echo "File $TEMP_XLSX exists."

# Ensure the file is readable by the current user (done before the content
# checks below, which need to read it)
chmod +r "$TEMP_XLSX"

# A name and an existing file say nothing about the content: an HTTP 200
# response can still carry an HTML page instead of a workbook.
if [[ ! -s "$TEMP_XLSX" ]]; then
    echo "File $TEMP_XLSX is empty."
    exit 1
fi

# XLSX workbooks are ZIP archives, and ZIP archives start with the bytes "PK".
if [[ "$(head -c 2 -- "$TEMP_XLSX" 2>/dev/null)" != "PK" ]]; then
    echo "File $TEMP_XLSX is not a ZIP/XLSX archive (it may be an HTML error or sign-in page)."
    exit 1
fi

# List directory contents to verify file existence
ls -la

# Check file size for verification
echo "File size of $TEMP_XLSX:"
ls -l "$TEMP_XLSX"

# Create Python script to handle XLSX to JSON conversion.
#
# NOTE: the here-document delimiter is unquoted, so the shell expands
# ${TEMP_XLSX} while writing the file. The Python code below must therefore
# not contain any other dollar signs, backticks or backslashes.
#
# Output of the generated script:
#   <sheet title, lowercased>.json  one file per worksheet, shaped as
#       {"level1": [{"name", "uid", "level2": [... "level5": [{"name", "uid"}]]}]}
#   data.json                       {"countries": [{"name", "filename"}, ...]}
cat << EOF > "$PYTHON_SCRIPT"
import json
import os
import subprocess
import sys
import zipfile

try:
    import openpyxl
except ImportError:
    print("openpyxl not found. Attempting to install...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl"])
    import openpyxl

from openpyxl.utils.exceptions import InvalidFileException

# Path of the downloaded workbook, filled in by the shell at generation time.
XLSX_PATH = "${TEMP_XLSX}"

# Segments 2-6 of a freshly generated UID.
UID_TAIL = ["0001", "00000", "000000", "0000000", "00000000"]


def clean_name(value):
    """Return the cell as a string with '=' removed, or None for an empty cell."""
    return str(value).replace('=', '') if value else None


def parse_uid(uid, country_counter):
    """Split the UID cell of one row into its six segments.

    A missing, non-string or dash-less UID advances the per-sheet counter and
    yields a generated UID. A UID that does not have exactly six segments is
    replaced by a generated one built from the current counter value.

    Returns (uid_parts, country_counter).
    """
    if not uid or not isinstance(uid, str) or '-' not in uid:
        country_counter += 1
        return [f"{country_counter:03d}"] + UID_TAIL, country_counter
    parts = uid.split('-')
    if len(parts) != 6:
        parts = [f"{country_counter:03d}"] + UID_TAIL
    return parts, country_counter


def bump(uid_parts, index, width):
    """Increment one UID segment in place, keeping its zero-padded width."""
    uid_parts[index] = f"{int(uid_parts[index]) + 1:0{width}d}"


def new_node():
    """Create an empty hierarchy node; 'level5' is only filled on level-4 nodes."""
    return {"uid": None, "children": {}, "level5": []}


def child_of(parent, name):
    """Return the child called name under parent, creating it on first use."""
    return parent["children"].setdefault(name, new_node())


def report_orphan(sheet_title, row_number, name):
    """Warn about a row whose parent level has not been seen yet."""
    print(
        f"Warning: sheet '{sheet_title}' row {row_number}: "
        f"'{name}' has no parent level; row skipped.",
        file=sys.stderr,
    )


def export(name, node, depth):
    """Convert a node (and everything below it) to its JSON form."""
    item = {"name": name, "uid": node["uid"]}
    if depth == 4:
        item["level5"] = node["level5"]
    else:
        item[f"level{depth + 1}"] = [
            export(child_name, child, depth + 1)
            for child_name, child in node["children"].items()
        ]
    return item


def build_sheet(sheet):
    """Read one worksheet and return its {"level1": [...]} structure."""
    roots = {}
    country_counter = 0
    # Most recent node seen at each level. A row that leaves a level blank
    # attaches to the node carried over from an earlier row of this sheet.
    # NOTE(review): lower levels are not reset when a higher level changes,
    # matching the previous behaviour; confirm this suits the sheet layout.
    level1 = level2 = level3 = level4 = None

    # Skip the first row which contains headers
    rows = sheet.iter_rows(min_row=2, values_only=True)
    for row_number, row in enumerate(rows, start=2):
        # Only take the first 6 columns (A-F); pad rows of narrower sheets.
        cells = (tuple(row) + (None,) * 6)[:6]
        name1, name2, name3, name4, name5 = (clean_name(c) for c in cells[:5])
        uid_parts, country_counter = parse_uid(cells[5], country_counter)

        if name1:
            level1 = roots.setdefault(name1, new_node())
            level1["uid"] = '-'.join(uid_parts)
            # Levels below level 1 start counting from the default segments.
            uid_parts[1:] = UID_TAIL
        if name2:
            if level1 is None:
                report_orphan(sheet.title, row_number, name2)
                continue
            level2 = child_of(level1, name2)
            bump(uid_parts, 1, 4)
            level2["uid"] = '-'.join(uid_parts)
        if name3:
            if level2 is None:
                report_orphan(sheet.title, row_number, name3)
                continue
            level3 = child_of(level2, name3)
            bump(uid_parts, 2, 5)
            level3["uid"] = '-'.join(uid_parts)
        if name4:
            if level3 is None:
                report_orphan(sheet.title, row_number, name4)
                continue
            level4 = child_of(level3, name4)
            bump(uid_parts, 3, 6)
            level4["uid"] = '-'.join(uid_parts)
        if name5:
            if level4 is None:
                report_orphan(sheet.title, row_number, name5)
                continue
            # Level-5 entries use the last UID segment (index 5), as before.
            bump(uid_parts, 5, 8)
            level4["level5"].append({"name": name5, "uid": '-'.join(uid_parts)})

    return {"level1": [export(name, node, 1) for name, node in roots.items()]}


# Check if the file exists and is readable
if not os.path.exists(XLSX_PATH) or not os.access(XLSX_PATH, os.R_OK):
    print(f"File '{XLSX_PATH}' does not exist or is not readable.")
    sys.exit(1)

try:
    # Load the workbook
    wb = openpyxl.load_workbook(XLSX_PATH)
except (InvalidFileException, zipfile.BadZipFile) as e:
    print(f"Error reading XLSX file: {e}")
    sys.exit(1)

countries_data = {
    "countries": []
}

for sheet in wb.worksheets:
    sheet_data = build_sheet(sheet)

    # Write JSON for each sheet with lowercase filename
    json_filename = f"{sheet.title.lower()}.json"
    with open(json_filename, 'w') as json_file:
        json.dump(sheet_data, json_file, indent=2)

    # Append to countries_data for data.json
    countries_data["countries"].append({
        "name": sheet.title,
        "filename": json_filename
    })

# Write the mapping to data.json
with open("data.json", 'w') as data_file:
    json.dump(countries_data, data_file, indent=2)

print("JSON files have been created for each tab with corresponding lowercase names.")
print("data.json has been updated to map all countries to their JSON files.")

EOF

# Run Python script. Keep using the "python" command when it exists and fall
# back to "python3" on systems that do not ship a bare "python".
if command -v python > /dev/null 2>&1; then
    PYTHON_BIN=python
else
    PYTHON_BIN=python3
fi
"$PYTHON_BIN" "$PYTHON_SCRIPT"
# Remember the converter's status: without this the script's exit status
# would be that of the cleanup below, hiding a failed conversion.
convert_status=$?

# Clean up temporary files (-f: do not complain about a file that is already
# gone; --: the names are never parsed as options)
rm -f -- "$TEMP_XLSX" "$DOWNLOAD_LOG" "$PYTHON_SCRIPT"

if [ "$convert_status" -ne 0 ]; then
    echo "XLSX to JSON conversion failed (exit status $convert_status)."
    exit "$convert_status"
fi