geeksforgeeks_project1_swiggydataanalysis.py

# -*- coding: utf-8 -*-
"""GeeksforGeeks_Project1_SwiggyDataAnalysis.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1seW-A7Q3Xr44ADw-liYn8kPbBH2lCASS

# Project Name: Swiggy Restaurants Data Analysis

### Problem Statement:
1.   How many cities (including subregions) where Swiggy is having its restaurants listed?
2.   How many cities  (don't include subregions) where Swiggy is having their restaurants listed?
3.   The Subregion of Delhi with the maximum number of restaurants listed on Swiggy?
4.   Name the top 5 Most Expensive Cities in the Datasets.
5.   List out the top 5 Restaurants with Maximum & minimum ratings throughout the dataset.
6.   Name of top 5 cities with the highest number of restaurants listed.
7.   Top 10 cities as per the number of restaurants listed?
8.   Name the top 5 Most Popular Restaurants in Pune.
9.   Which SubRegion in Delhi is having the least expensive restaurant in terms of cost?
10.  Top 5 most popular restaurant chains in India?
11.  Which restaurant in Pune has the most number of people visiting?
12.  Top 10 Restaurants with Maximum Ratings in Banglore
13.  Top 10 Restaurant in Patna w.r.t rating

Dataset link: [Swiggy Data](https://drive.google.com/file/d/1MCnCNHJXxc97EYwIdBszOkEBusu9FpCa/view)
"""

from google.colab import drive
drive.mount('/content/drive')

!pip3 install itables
from itables import show
import numpy as np
from tabulate import tabulate

import json
import pandas as pd
pd.options.mode.chained_assignment = None

"""### Download data and cleansing data

Save data.json file in your local google drive using this [shareable link](https://drive.google.com/file/d/1ck3s3JSW-Sql5xpuvxFATJ6iDfHadkhp/view?usp=sharing).
"""

with open('./drive/MyDrive/data.json', 'r') as f:
    data = json.load(f)

# get all cities that are in the dataset
cities = list(data.keys())
print(len(cities))
# for some cities you notice that there are some cases where restaurants are in special sub-regions
for i in cities:
    sub = list(data[i].keys())
    # print(sub)
    # lets check all cities that do not have 'restaurants' as keys
#     if 'restaurants' not in sub:
#         print(sub)

df = pd.DataFrame() # Empty dataframe for the whole dataset
for i in cities:
    subregions = list(data[i].keys())
    # check for all cities with no sub-regions
    try:
        # get restaurant data for each city
        rest = pd.DataFrame(data[i]['restaurants']).T
        # add city column for each restaurant
        city = [i for x in range(len(rest.index))]
        rest['city'] = city
        # also add sub-region for each restaurant which is same as city
        rest['sub_region'] = city
        # Concatenate to main Dataframe
        df = pd.concat([df,rest])
    # check for all cities with sub-regions
    except:
        for r in subregions:
            # Convert the city-wise restaurant data to pandas Dataframe
            rest = pd.DataFrame(data[i][r]['restaurants']).T
            # City
            city = [i for x in range(len(rest.index))]
            # Add City column for each restaurants in that city
            rest['city'] = city
            # Sub-region
            subregion = [r for x in range(len(rest.index))]
            # add sub-region column for each restaurants in that city
            rest['sub_region'] = subregion
            df = pd.concat([df,rest])

# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')

# DATA CLEANING
# reset index as a column and rename column index and name
df.reset_index(level=0, inplace=True)
df = df.rename(columns={'index':'restaurant_id', 'name':'restaurant'})
# print(type(df['restaurant_id']))
df.drop_duplicates(subset=['restaurant_id'], keep='first', inplace = True)
#df.set_index('id', inplace=True)
df.shape

# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')

# Clean data by dropping price and type
df = df.drop(['type','price'], axis =1)
df.dropna(inplace = True, axis = 0)
df.reset_index(inplace = True)
df.drop(['index'], axis = 1, inplace = True)
df.shape

df.city.replace(['Noida-1','Noida'],'Noida',inplace=True)
# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')

# before we go ahead we need to filter cost column by removing ₹ sign from the cost and converting it to int
df['cost'] = df['cost'].str.replace('₹', '')
df['cost'] = df['cost'].str.replace('\s', '', regex=True)
df['cost'] = pd.to_numeric(df['cost'], errors='coerce')
df['cost'] = df['cost'].replace(np.nan, 0, regex=True)
# remove restaurants with NA values
df = df[df['restaurant'] != 'NA']

# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')

"""### Number of cities(including sub-regions) that have Swiggy restaurants"""

# given cleaned swiggy data in df get count of unique cities
count_of_cities = df['city'].nunique()
print("Number of cities (including subregions) where Swiggy is having its restaurants listed are: ", count_of_cities)

"""### Number of cities(not including sub-regions) that have Swiggy restaurants"""

# given cleaned swiggy data in df get count of subregions as they include cities as well
count_of_subregions = df['sub_region'].nunique()
print("Number of cities (not including subregions) where Swiggy is having its restaurants listed are: ", count_of_subregions)

"""### Delhi subregion with maximum number of restaurants listed on Swiggy"""

# get number of restaurants listed for Delhi
delhi_rest = df[df['city'] == 'Delhi']
# next find count of all resataurants grouping by sub-region of Delhi and then sort them in ascending order.
# select first entry as it is the maximum
M = delhi_rest.groupby('sub_region')['sub_region'].count().sort_values(ascending=False).head(1)
print("The subregion of Delhi with the maximum restaurants listed on Swiggy is ->")
M.rename('no_of_restaurants',inplace=True)
# show(M)
# tabulate(M, headers = 'keys', tablefmt = 'html')
print(M)

"""### Top 5 Most Expensive Cities in the dataset"""

# next find sum of all costs of restaurants grouping by city and then sort them in ascending order.
# select first 5 entry as it is the top 5
max_expensive = df.groupby('city')['cost'].sum().sort_values(ascending=False).head(5)
print("The top 5 most expensive cities listed on Swiggy are -> ")
max_expensive.rename('TotalCost(in ₹)',inplace=True)
# show(max_expensive)
# tabulate(max_expensive, headers = 'keys', tablefmt = 'html')
print(max_expensive)

"""### Top 5 Restaurants with Maximum & Ninimum ratings throughout the dataset"""

def rating_rank(df):
  rating_rank = []
  for i in df.index:
      z = df['rating_count'][i]
      if z == 'Too Few Ratings':
          rating_rank.append(0)
      elif z == '20+ ratings':
          rating_rank.append(1)
      elif z == '50+ ratings':
          rating_rank.append(2)
      elif z == '100+ ratings':
          rating_rank.append(3)
      elif z == '500+ ratings':
          rating_rank.append(4)
      elif z == '1K+ ratings':
          rating_rank.append(5)
      else:
          rating_rank.append(-1)
  # df['rating_rank'] = rating_rank
  df.insert(4,'rating_rank',rating_rank, True)

# first exclude all restaurants with too few ratings
df_ratings = df[df['rating_count']!='Too Few Ratings']
# convert str column to float
df_ratings['rating'] = pd.to_numeric(df_ratings['rating'], errors='coerce')
rating_rank(df_ratings)
# next find max rating of all resataurants grouping by restaurant name and then sort them in ascending and descending order.
# select first 5 entry as it is the max for ascending and min for descending order.
max_ratings = df_ratings.sort_values(by=['rating','rating_rank'],ascending=False).head(5)
min_ratings = df_ratings.sort_values(by=['rating','rating_rank'],ascending=True).head(5)
print("The top 5 most rated restaurants listed on Swiggy with 5.0 rating are -> ")
# show(max_ratings[['restaurant','rating','rating_count']])
tabulate(max_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')

print("*Note there are several restaurants with rating 5.0 other than the 5 mentioned and hence rating_count is also included to determine maximum rating restaurant.")
print("The top 5 least rated restaurants listed on Swiggy with lowest rating are -> ")
# show(min_ratings[['restaurant','rating','rating_count']])
tabulate(min_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')

"""### Top 5 cities with the highest number of restaurants listed."""

top5_cities = df.groupby('city')['restaurant_id'].count().sort_values(ascending=False).head(5)
print("The top 5 cities with highest number of restaurants listed on Swiggy are -> ")
top5_cities.rename('Total_Restaurants', inplace=True)
# show(top5_cities)
# tabulate(top5_cities, headers = 'keys', tablefmt = 'html')
print(top5_cities)

"""### Top 10 cities with the highest number of restaurants listed."""

top10_cities = df.groupby('city')['restaurant_id'].count().sort_values(ascending=False).head(10)
print("The top 10 cities with highest number of restaurants listed on Swiggy are -> ")
top10_cities.rename('Total_Restaurants', inplace=True)
# show(top10_cities)
# tabulate(top10_cities, headers = 'keys', tablefmt = 'html')
print(top10_cities)

"""### Top 5 Most Popular Restaurants in Pune"""

# get all all restaurants in Pune
pune_rest = df[df['city'] == 'Pune']
# find all the unique rating counts
rating_unique = pune_rest['rating_count'].unique()
print(rating_unique)
# Assign a new column based on rating_count rank with
# 1K+ being the highest and NA being the lowest
rating_rank(pune_rest)
# sort the table with newly created rating-rank and rank with top 5 pop values with highest rating first
pop_rest = pune_rest.sort_values(by=['rating_rank','rating'],ascending=False).head(5)
print("The top 5 most popular restaurants in Pune listed on Swiggy are -> ")
# show(pop_rest[['restaurant','rating','rating_count']])
tabulate(pop_rest[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')

"""### Sub region of Least expensive restaurant in Delhi"""

# previously got all restaurants in delhi saved in delhi_rest
delhi_rest = df[df['city'] == 'Delhi']
# remove reataurants with NA values
delhi_rest = delhi_rest[delhi_rest['restaurant'] != 'NA']
# to make sure we have good values choose restaurants with some rating values.
delhi_rest = delhi_rest[delhi_rest['rating'] != '--']
M = delhi_rest.sort_values(by=['cost'],ascending=True).head(1)
# show(M[['restaurant','sub_region','cost']])
tabulate(M[['restaurant','sub_region','cost']], headers = 'keys', tablefmt = 'html')

print("The least expensive restaurant in Delhi listed on Swiggy with cost of two (₹{}) is {} and lies in {} subregion.".format(list(M['cost'])[0],list(M['restaurant'])[0],list(M['sub_region'])[0]))

"""### Top 5 most popular restaurant chains in India"""

# First let's divide restaurants into chains and single-outlets restaurants
rest_outlet = df['restaurant'].value_counts() # gives us count of all unique restaurant outlets
# print(rest)
single_outlet = rest_outlet[rest_outlet == 1]
# print(single_outlet)
chains = rest_outlet[rest_outlet >= 2]
# top 5 most popular
print("Top 5 most popular restaurants chains in India are ->")
chains.rename('No_of_outlets',inplace=True)
# show(chains.head(5))
# tabulate(chains.head(5), headers = 'keys', tablefmt = 'html')
print(chains.head(5))

"""### Restaurant with most number of people visited in city of Pune"""

# this question is similar to Q8
# in our case most visited would be probably be correlated to most reviewed.
# get all all restaurants in Pune
pune_rest = df[df['city'] == 'Pune']
# find all the unique rating counts
rating_unique = pune_rest['rating_count'].unique()
print(rating_unique)
# From the list above assign a new column based on rating_count rank with
# 1K+ being the highest and NA being the lowest
rating_rank(pune_rest)
# sort the table with newly created rating-rank and rank with top 5 pop values with highest rating first
pop_rest = pune_rest.sort_values(by=['rating_rank','rating'],ascending=False).head(1)
print("In the city of Pune the most visited restaurant is ->")
# show(pop_rest[['restaurant','rating_count','rating']])
tabulate(pop_rest[['restaurant','rating_count','rating']], headers = 'keys', tablefmt = 'html')

"""### Top 10 Restaurants with Maximum Ratings in Bangalore"""

# first exclude all restaurants with too few ratings
df_ratings = df[df['rating_count']!='Too Few Ratings']
# convert str column to float
df_ratings['rating'] = pd.to_numeric(df_ratings['rating'], errors='coerce')
# get all all restaurants in Bangalore
bang_rest = df_ratings[df_ratings['city'] == 'Bangalore']
rating_rank(bang_rest)
max_ratings = bang_rest.sort_values(by=['rating','rating_rank'],ascending=False).head(10)
print("Top 10 Restaurants with Maximum Ratings in Bangalore are ->")
# show(max_ratings[['restaurant','rating','rating_count']])
tabulate(max_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')

"""### Top 10 Restaurant in Patna w.r.t rating"""

# first exclude all restaurants with too few ratings
df_ratings = df[df['rating_count']!='Too Few Ratings']
# convert str column to float
df_ratings['rating'] = pd.to_numeric(df_ratings['rating'], errors='coerce')
patna_rest = df_ratings[df_ratings['city'] == 'Patna']
rating_rank(patna_rest)
max_ratings = patna_rest.sort_values(by=['rating','rating_rank'],ascending=False).head(10)
print("Top 10 Restaurants with Maximum Ratings in Patna are ->")
# show(max_ratings[['restaurant','rating','rating_count']])
tabulate(max_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')