-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeeksforgeeks_project1_swiggydataanalysis.py
309 lines (260 loc) · 13.5 KB
/
geeksforgeeks_project1_swiggydataanalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# -*- coding: utf-8 -*-
"""GeeksforGeeks_Project1_SwiggyDataAnalysis.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1seW-A7Q3Xr44ADw-liYn8kPbBH2lCASS
# Project Name: Swiggy Restaurants Data Analysis
### Problem Statement:
1. How many cities (including subregions) where Swiggy is having its restaurants listed?
2. How many cities (don't include subregions) where Swiggy is having their restaurants listed?
3. The Subregion of Delhi with the maximum number of restaurants listed on Swiggy?
4. Name the top 5 Most Expensive Cities in the Datasets.
5. List out the top 5 Restaurants with Maximum & minimum ratings throughout the dataset.
6. Name of top 5 cities with the highest number of restaurants listed.
7. Top 10 cities as per the number of restaurants listed?
8. Name the top 5 Most Popular Restaurants in Pune.
9. Which SubRegion in Delhi is having the least expensive restaurant in terms of cost?
10. Top 5 most popular restaurant chains in India?
11. Which restaurant in Pune has the most number of people visiting?
12. Top 10 Restaurants with Maximum Ratings in Banglore
13. Top 10 Restaurant in Patna w.r.t rating
Dataset link: [Swiggy Data](https://drive.google.com/file/d/1MCnCNHJXxc97EYwIdBszOkEBusu9FpCa/view)
"""
from google.colab import drive
drive.mount('/content/drive')
!pip3 install itables
from itables import show
import numpy as np
from tabulate import tabulate
import json
import pandas as pd
pd.options.mode.chained_assignment = None
"""### Download data and cleansing data
Save data.json file in your local google drive using this [shareable link](https://drive.google.com/file/d/1ck3s3JSW-Sql5xpuvxFATJ6iDfHadkhp/view?usp=sharing).
"""
with open('./drive/MyDrive/data.json', 'r') as f:
data = json.load(f)
# get all cities that are in the dataset
cities = list(data.keys())
print(len(cities))
# for some cities you notice that there are some cases where restaurants are in special sub-regions
for i in cities:
sub = list(data[i].keys())
# print(sub)
# lets check all cities that do not have 'restaurants' as keys
# if 'restaurants' not in sub:
# print(sub)
df = pd.DataFrame() # Empty dataframe for the whole dataset
for i in cities:
subregions = list(data[i].keys())
# check for all cities with no sub-regions
try:
# get restaurant data for each city
rest = pd.DataFrame(data[i]['restaurants']).T
# add city column for each restaurant
city = [i for x in range(len(rest.index))]
rest['city'] = city
# also add sub-region for each restaurant which is same as city
rest['sub_region'] = city
# Concatenate to main Dataframe
df = pd.concat([df,rest])
# check for all cities with sub-regions
except:
for r in subregions:
# Convert the city-wise restaurant data to pandas Dataframe
rest = pd.DataFrame(data[i][r]['restaurants']).T
# City
city = [i for x in range(len(rest.index))]
# Add City column for each restaurants in that city
rest['city'] = city
# Sub-region
subregion = [r for x in range(len(rest.index))]
# add sub-region column for each restaurants in that city
rest['sub_region'] = subregion
df = pd.concat([df,rest])
# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')
# DATA CLEANING
# reset index as a column and rename column index and name
df.reset_index(level=0, inplace=True)
df = df.rename(columns={'index':'restaurant_id', 'name':'restaurant'})
# print(type(df['restaurant_id']))
df.drop_duplicates(subset=['restaurant_id'], keep='first', inplace = True)
#df.set_index('id', inplace=True)
df.shape
# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')
# Clean data by dropping price and type
df = df.drop(['type','price'], axis =1)
df.dropna(inplace = True, axis = 0)
df.reset_index(inplace = True)
df.drop(['index'], axis = 1, inplace = True)
df.shape
df.city.replace(['Noida-1','Noida'],'Noida',inplace=True)
# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')
# before we go ahead we need to filter cost column by removing ₹ sign from the cost and converting it to int
df['cost'] = df['cost'].str.replace('₹', '')
df['cost'] = df['cost'].str.replace('\s', '', regex=True)
df['cost'] = pd.to_numeric(df['cost'], errors='coerce')
df['cost'] = df['cost'].replace(np.nan, 0, regex=True)
# remove restaurants with NA values
df = df[df['restaurant'] != 'NA']
# show(df.head(5))
tabulate(df.loc[:, df.columns != 'menu'].head(5), headers = 'keys', tablefmt = 'html')
"""### Number of cities(including sub-regions) that have Swiggy restaurants"""
# given cleaned swiggy data in df get count of unique cities
count_of_cities = df['city'].nunique()
print("Number of cities (including subregions) where Swiggy is having its restaurants listed are: ", count_of_cities)
"""### Number of cities(not including sub-regions) that have Swiggy restaurants"""
# given cleaned swiggy data in df get count of subregions as they include cities as well
count_of_subregions = df['sub_region'].nunique()
print("Number of cities (not including subregions) where Swiggy is having its restaurants listed are: ", count_of_subregions)
"""### Delhi subregion with maximum number of restaurants listed on Swiggy"""
# get number of restaurants listed for Delhi
delhi_rest = df[df['city'] == 'Delhi']
# next find count of all resataurants grouping by sub-region of Delhi and then sort them in ascending order.
# select first entry as it is the maximum
M = delhi_rest.groupby('sub_region')['sub_region'].count().sort_values(ascending=False).head(1)
print("The subregion of Delhi with the maximum restaurants listed on Swiggy is ->")
M.rename('no_of_restaurants',inplace=True)
# show(M)
# tabulate(M, headers = 'keys', tablefmt = 'html')
print(M)
"""### Top 5 Most Expensive Cities in the dataset"""
# next find sum of all costs of restaurants grouping by city and then sort them in ascending order.
# select first 5 entry as it is the top 5
max_expensive = df.groupby('city')['cost'].sum().sort_values(ascending=False).head(5)
print("The top 5 most expensive cities listed on Swiggy are -> ")
max_expensive.rename('TotalCost(in ₹)',inplace=True)
# show(max_expensive)
# tabulate(max_expensive, headers = 'keys', tablefmt = 'html')
print(max_expensive)
"""### Top 5 Restaurants with Maximum & Ninimum ratings throughout the dataset"""
def rating_rank(df):
rating_rank = []
for i in df.index:
z = df['rating_count'][i]
if z == 'Too Few Ratings':
rating_rank.append(0)
elif z == '20+ ratings':
rating_rank.append(1)
elif z == '50+ ratings':
rating_rank.append(2)
elif z == '100+ ratings':
rating_rank.append(3)
elif z == '500+ ratings':
rating_rank.append(4)
elif z == '1K+ ratings':
rating_rank.append(5)
else:
rating_rank.append(-1)
# df['rating_rank'] = rating_rank
df.insert(4,'rating_rank',rating_rank, True)
# first exclude all restaurants with too few ratings
df_ratings = df[df['rating_count']!='Too Few Ratings']
# convert str column to float
df_ratings['rating'] = pd.to_numeric(df_ratings['rating'], errors='coerce')
rating_rank(df_ratings)
# next find max rating of all resataurants grouping by restaurant name and then sort them in ascending and descending order.
# select first 5 entry as it is the max for ascending and min for descending order.
max_ratings = df_ratings.sort_values(by=['rating','rating_rank'],ascending=False).head(5)
min_ratings = df_ratings.sort_values(by=['rating','rating_rank'],ascending=True).head(5)
print("The top 5 most rated restaurants listed on Swiggy with 5.0 rating are -> ")
# show(max_ratings[['restaurant','rating','rating_count']])
tabulate(max_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')
print("*Note there are several restaurants with rating 5.0 other than the 5 mentioned and hence rating_count is also included to determine maximum rating restaurant.")
print("The top 5 least rated restaurants listed on Swiggy with lowest rating are -> ")
# show(min_ratings[['restaurant','rating','rating_count']])
tabulate(min_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')
"""### Top 5 cities with the highest number of restaurants listed."""
top5_cities = df.groupby('city')['restaurant_id'].count().sort_values(ascending=False).head(5)
print("The top 5 cities with highest number of restaurants listed on Swiggy are -> ")
top5_cities.rename('Total_Restaurants', inplace=True)
# show(top5_cities)
# tabulate(top5_cities, headers = 'keys', tablefmt = 'html')
print(top5_cities)
"""### Top 10 cities with the highest number of restaurants listed."""
top10_cities = df.groupby('city')['restaurant_id'].count().sort_values(ascending=False).head(10)
print("The top 10 cities with highest number of restaurants listed on Swiggy are -> ")
top10_cities.rename('Total_Restaurants', inplace=True)
# show(top10_cities)
# tabulate(top10_cities, headers = 'keys', tablefmt = 'html')
print(top10_cities)
"""### Top 5 Most Popular Restaurants in Pune"""
# get all all restaurants in Pune
pune_rest = df[df['city'] == 'Pune']
# find all the unique rating counts
rating_unique = pune_rest['rating_count'].unique()
print(rating_unique)
# Assign a new column based on rating_count rank with
# 1K+ being the highest and NA being the lowest
rating_rank(pune_rest)
# sort the table with newly created rating-rank and rank with top 5 pop values with highest rating first
pop_rest = pune_rest.sort_values(by=['rating_rank','rating'],ascending=False).head(5)
print("The top 5 most popular restaurants in Pune listed on Swiggy are -> ")
# show(pop_rest[['restaurant','rating','rating_count']])
tabulate(pop_rest[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')
"""### Sub region of Least expensive restaurant in Delhi"""
# previously got all restaurants in delhi saved in delhi_rest
delhi_rest = df[df['city'] == 'Delhi']
# remove reataurants with NA values
delhi_rest = delhi_rest[delhi_rest['restaurant'] != 'NA']
# to make sure we have good values choose restaurants with some rating values.
delhi_rest = delhi_rest[delhi_rest['rating'] != '--']
M = delhi_rest.sort_values(by=['cost'],ascending=True).head(1)
# show(M[['restaurant','sub_region','cost']])
tabulate(M[['restaurant','sub_region','cost']], headers = 'keys', tablefmt = 'html')
print("The least expensive restaurant in Delhi listed on Swiggy with cost of two (₹{}) is {} and lies in {} subregion.".format(list(M['cost'])[0],list(M['restaurant'])[0],list(M['sub_region'])[0]))
"""### Top 5 most popular restaurant chains in India"""
# First let's divide restaurants into chains and single-outlets restaurants
rest_outlet = df['restaurant'].value_counts() # gives us count of all unique restaurant outlets
# print(rest)
single_outlet = rest_outlet[rest_outlet == 1]
# print(single_outlet)
chains = rest_outlet[rest_outlet >= 2]
# top 5 most popular
print("Top 5 most popular restaurants chains in India are ->")
chains.rename('No_of_outlets',inplace=True)
# show(chains.head(5))
# tabulate(chains.head(5), headers = 'keys', tablefmt = 'html')
print(chains.head(5))
"""### Restaurant with most number of people visited in city of Pune"""
# this question is similar to Q8
# in our case most visited would be probably be correlated to most reviewed.
# get all all restaurants in Pune
pune_rest = df[df['city'] == 'Pune']
# find all the unique rating counts
rating_unique = pune_rest['rating_count'].unique()
print(rating_unique)
# From the list above assign a new column based on rating_count rank with
# 1K+ being the highest and NA being the lowest
rating_rank(pune_rest)
# sort the table with newly created rating-rank and rank with top 5 pop values with highest rating first
pop_rest = pune_rest.sort_values(by=['rating_rank','rating'],ascending=False).head(1)
print("In the city of Pune the most visited restaurant is ->")
# show(pop_rest[['restaurant','rating_count','rating']])
tabulate(pop_rest[['restaurant','rating_count','rating']], headers = 'keys', tablefmt = 'html')
"""### Top 10 Restaurants with Maximum Ratings in Bangalore"""
# first exclude all restaurants with too few ratings
df_ratings = df[df['rating_count']!='Too Few Ratings']
# convert str column to float
df_ratings['rating'] = pd.to_numeric(df_ratings['rating'], errors='coerce')
# get all all restaurants in Bangalore
bang_rest = df_ratings[df_ratings['city'] == 'Bangalore']
rating_rank(bang_rest)
max_ratings = bang_rest.sort_values(by=['rating','rating_rank'],ascending=False).head(10)
print("Top 10 Restaurants with Maximum Ratings in Bangalore are ->")
# show(max_ratings[['restaurant','rating','rating_count']])
tabulate(max_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')
"""### Top 10 Restaurant in Patna w.r.t rating"""
# first exclude all restaurants with too few ratings
df_ratings = df[df['rating_count']!='Too Few Ratings']
# convert str column to float
df_ratings['rating'] = pd.to_numeric(df_ratings['rating'], errors='coerce')
patna_rest = df_ratings[df_ratings['city'] == 'Patna']
rating_rank(patna_rest)
max_ratings = patna_rest.sort_values(by=['rating','rating_rank'],ascending=False).head(10)
print("Top 10 Restaurants with Maximum Ratings in Patna are ->")
# show(max_ratings[['restaurant','rating','rating_count']])
tabulate(max_ratings[['restaurant','rating','rating_count']], headers = 'keys', tablefmt = 'html')