Exploratory Data Analysis

Contents

As we have large amounf data so we are loading data line by line in dataframe business_df, review_df, user_df

import json

def readjson(filepath):
    data = []
    i=0
    with open(filepath,encoding="utf8") as f:
            for line in f:
                 if i<100000:
                    data.append(json.loads(line))
                    #print(i)
                    i +=1
    return pd.DataFrame(data)

business_df = readjson('./dataset/business.json')
review_df = readjson('./dataset/review.json')
user_df = readjson('./dataset/user.json')

Getting reaturants out of business dataframe based on Food category

business_df['categories'] = business_df['categories'].astype(str)
restaurant_df = business_df[business_df['categories'].str.contains('Food')==True]

complete_df = restaurant_df.merge(review_df,on='business_id').merge(user_df,on='user_id')

complete_df.head(2)
address attributes business_id categories city hours is_open latitude longitude name_x neighborhood postal_code review_count_x stars_x state cool_x date funny_x review_id stars_y text useful_x user_id average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain compliment_profile compliment_writer cool_y elite fans friends funny_y name_y review_count_y useful_y yelping_since
0 1203 E Charleston Blvd, Ste 140 {'BusinessParking': {'validated': False, 'gara... YTqtM2WFhcMZGeAGA08Cfg ['Seafood', 'Restaurants', 'Specialty Food', '... Las Vegas {'Sunday': '10:15-21:00', 'Wednesday': '10:30-... 1 36.159363 -115.135949 Mariscos Playa Escondida Downtown 89104 330 4.5 NV 0 2016-09-16 1 ZH8g_PoY0Tr3YdQ-RGySrA 5 Great place. There was a man here who was very... 1 EDe16577dBImA1ypOzPlKg 5.00 0 0 0 0 0 0 0 0 0 0 0 0 [] 0 [] 0 Jessica 1 0 2014-07-26
1 1203 E Charleston Blvd, Ste 140 {'BusinessParking': {'validated': False, 'gara... YTqtM2WFhcMZGeAGA08Cfg ['Seafood', 'Restaurants', 'Specialty Food', '... Las Vegas {'Sunday': '10:15-21:00', 'Wednesday': '10:30-... 1 36.159363 -115.135949 Mariscos Playa Escondida Downtown 89104 330 4.5 NV 1 2014-11-13 1 6r2uAJE1dqUq1IHn_3R3qA 4 HOT HOT HOT! Real Mexican Food\n\nNO fake wate... 2 twx2ZgFUbat87vGQ_tFbPA 3.55 0 0 0 2 0 1 3 0 5 0 0 11 [] 3 [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... 29 Edwin 94 317 2010-12-30
restaurant_df.describe()
is_open latitude longitude review_count stars
count 18503.00000 18503.000000 18503.000000 18503.000000 18503.000000
mean 0.83073 39.702568 -87.807760 34.804464 3.546857
std 0.37500 5.747548 27.691971 82.946472 0.889710
min 0.00000 -34.520401 -119.551325 3.000000 1.000000
25% 1.00000 35.135615 -112.013439 5.000000 3.000000
50% 1.00000 40.440368 -81.357777 11.000000 3.500000
75% 1.00000 43.665419 -79.414244 31.000000 4.000000
max 1.00000 59.438181 11.769500 3439.000000 5.000000
user_df.describe()
average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain compliment_profile compliment_writer cool fans funny review_count useful
count 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000 100000.000000
mean 3.729684 16.342210 0.950070 16.342210 12.015470 0.416970 1.465460 6.980040 5.491070 15.870480 1.046280 6.151540 91.215580 5.103230 64.731610 66.524450 120.838970
std 0.835715 197.424646 16.639768 197.424646 175.458886 7.165452 15.762362 70.410324 153.225409 194.113025 19.474635 73.883346 1509.129416 29.803631 1049.502721 178.975429 1610.123217
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 3.350000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5.000000 0.000000
50% 3.810000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 16.000000 2.000000
75% 4.240000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 2.000000 2.000000 3.000000 50.000000 13.000000
max 5.000000 16710.000000 2146.000000 16710.000000 19988.000000 1265.000000 1576.000000 6340.000000 33297.000000 13075.000000 2232.000000 7117.000000 175230.000000 1837.000000 103514.000000 11065.000000 187179.000000
review_df.describe()
cool funny stars useful
count 100000.000000 100000.000000 100000.000000 100000.00000
mean 0.532470 0.411740 3.730530 1.01213
std 1.992121 1.655608 1.418456 2.46252
min 0.000000 0.000000 1.000000 0.00000
25% 0.000000 0.000000 3.000000 0.00000
50% 0.000000 0.000000 4.000000 0.00000
75% 0.000000 0.000000 5.000000 1.00000
max 104.000000 114.000000 5.000000 113.00000
review_df.head(2)
business_id cool date funny review_id stars text useful user_id
0 uYHaNptLzDLoV_JZ_MuzUA 0 2016-07-12 0 VfBHSwC5Vz_pbFluy07i9Q 5 My girlfriend and I stayed here for 3 nights a... 0 cjpdDjZyprfyDG3RlkVG3w
1 uYHaNptLzDLoV_JZ_MuzUA 0 2016-10-02 0 3zRpneRKDsOPq92tq7ybAA 3 If you need an inexpensive place to stay for a... 0 bjTcT8Ty4cJZhEOEo01FGA

Performing Exploratory data analysis

sns.pairplot(restaurant_df.iloc[0:10000,:]);

png

Distribution count of Restaurant rating

We can see below more restaurants get 4 rating than other ratings

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))

sns.distplot(restaurant_df.stars,kde=False,color = 'g',ax =ax,bins=20);
ax.axvline(restaurant_df.stars.mean(), 0, 1, color='r', label='Mean')
ax.legend();
ax.set_ylabel('Count',size=20)
ax.set_xlabel('Stars',size=20)
ax.set_title('Distribution(count) of Restaurant rating',size=20);

png

Distribution count of Reviews rating for restaurants

We can see below more reviews have 5 rating than other ratings

#review just for business which are restautrant
review_df_filter_df = review_df.merge(restaurant_df,how='inner',on='business_id')

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
sns.distplot(review_df_filter_df.stars_x,kde=False,color = 'g',ax =ax,bins=20);
ax.axvline(review_df_filter_df.stars_x.mean(), 0, 1, color='r', label='Mean')
ax.legend();
ax.set_ylabel('Count',size=20)
ax.set_xlabel('Stars',size=20)
ax.set_title('Distribution(count) of different Reviews rating',size=20)
Text(0.5,1,'Distribution(count) of different Reviews rating')

png

Distribution count of user rating for restaurants

We can see below users have around mean of 3.7 rating

#user just for business which are restautrant
user_df_filter_df = complete_df.groupby(['user_id'],as_index=False).mean()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
sns.distplot(user_df_filter_df.average_stars,kde=False,color = 'g',ax =ax,bins=20);
ax.axvline(user_df_filter_df.average_stars.mean(), 0, 1, color='r', label='Mean')
ax.legend();
ax.set_ylabel('Count',size=20)
ax.set_xlabel('Stars',size=20)
ax.set_title('Distribution(count) of User given rating',size=20)

#fig.tight_layout()
Text(0.5,1,'Distribution(count) of User given rating')

png

Scatter plot various features

We can see that useful, funny and cool are correlated

sns.pairplot(review_df_filter_df.iloc[0:10000,:]);

png

Most Reviewed Restaurant

Bouchon at the Venezia Tower is reviewed almost double as compared to others

#get top 20 most reviewed restaurants
n_top =20
most_reviewed_restaurant = restaurant_df.nlargest(n_top, 'review_count')
fig, ax = plt.subplots()
ax = sns.barplot(y="name", x="review_count", data=most_reviewed_restaurant)
ax.set_xlabel('Review Count',size=20)
fig.set_size_inches(12, 8)
plt.title("Most Reviewed Restaurant",fontsize=24);
ax.grid(axis = 'x', color ='green', linestyle='-')
ax.tick_params(axis='both', which='both',length=0)
sns.despine(left=True, bottom=True)

png

Top 10 5 star rated Restaurant

Poke Express is the top 5 star rated restaurant

top_rated_restaurant = restaurant_df.sort_values(by=['stars','review_count'],
                                                 ascending=False)[['name','business_id','review_count','stars']]
#get top 10 5 star rated restaurant
n_top =10
top_rated_restaurant = top_rated_restaurant.nlargest(n_top, 'stars')
fig, ax = plt.subplots()
ax = sns.barplot(y="name", x="review_count", data=top_rated_restaurant)
ax.set_xlabel('Count',size=20)
fig.set_size_inches(12, 8)
plt.title("Top 10 5 star rated Restaurant",fontsize=24);
ax.grid(axis = 'x', color ='green', linestyle='-')
ax.tick_params(axis='both', which='both',length=0)
sns.despine(left=True, bottom=True)

png

Getting different food categories from the restaurant dataframe

top_rated_restaurant = restaurant_df.sort_values(by=['stars','review_count'],
                                                 ascending=False)[['name','business_id','review_count','stars']]
#top_rated_restaurant
def get_food_type_count(category):
    count = restaurant_df[restaurant_df['categories'].str.contains(category)==True]['business_id'].count()
    return count

food_dict = {}
food_categories = ['American','Italian','Mexican','Chinese','Thai','Indian','Japan','French']
for food_category in food_categories:
    food_dict[food_category] = get_food_type_count(food_category)
    

Distribution of review count with respect to Food Categories

We can see American restaurant have higher count of reviews followed by Mexican

plt.figure(figsize=(20,10))
plt.bar(range(len(food_dict)), food_dict.values(), align='center',color='forestgreen')
plt.xticks(range(len(food_dict)), list(food_dict.keys()),fontsize = 15);
plt.title('Distribution of review count with respect to Food Categories',fontsize=18)
plt.xlabel('Food Category',fontsize=18)
plt.ylabel('Count',fontsize=18)

Text(0,0.5,'Count')

png

Distribution(count) of American, Mexican, Italian, Chinese Restaurant rating

We can see American and Italian restaurants are rated higher than other restaurants

American_restaurant_rating_df = restaurant_df[restaurant_df['categories'].str.contains('American')==True][['business_id','stars','categories','name','review_count']]
Mexican_restaurant_rating_df = restaurant_df[restaurant_df['categories'].str.contains('Mexican')==True][['business_id','stars','categories','name','review_count']]
Chinese_restaurant_rating_df = restaurant_df[restaurant_df['categories'].str.contains('Chinese')==True][['business_id','stars','categories','name','review_count']]
Italian_restaurant_rating_df = restaurant_df[restaurant_df['categories'].str.contains('Italian')==True][['business_id','stars','categories','name','review_count']]

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(15, 8))
ax = ax.ravel()

def restaurant_category(df, title, ax):

    sns.distplot(df.stars,kde=False,color = 'g',ax =ax,bins=20);
    ax.axvline(df.stars.mean(), 0, 1, color='r', label='Mean')
    ax.legend();
    ax.set_ylabel('Count',size=20)
    ax.set_xlabel('Stars',size=20)
    ax.set_title('Distribution(count) of '+ title + ' Restaurant rating',size=20);

restaurant_category(American_restaurant_rating_df, 'American', ax[0])
restaurant_category(Mexican_restaurant_rating_df, 'Mexican', ax[1])
restaurant_category(Chinese_restaurant_rating_df, 'Chinese', ax[2])
restaurant_category(Italian_restaurant_rating_df, 'Italian', ax[3])

plt.tight_layout()

png

American_restaurant_rating_df.head(2)
business_id stars categories name review_count
34 reWc1g65PNZnKz_Ub9QKOQ 2.5 ['Comfort Food', 'Canadian (New)', 'Restaurant... Milestones Restaurants 51
55 Z1r6b30Tg0n0ME4-Zj2wQQ 3.0 ['American (Traditional)', 'Restaurants', 'Bar... Boardwalk Place 13

Top 20 American 5 star rated Restaurant

American_top_rated_restaurant = American_restaurant_rating_df.sort_values(by=['stars','review_count'],
                                                 ascending=False)[['name','business_id','review_count','stars']]
#get top 20 5 star rated restaurant
n_top =20
American_top_rated_restaurant = American_top_rated_restaurant.nlargest(n_top, 'stars')
fig, ax = plt.subplots()
ax = sns.barplot(y="name", x="review_count", data=American_top_rated_restaurant)
ax.set_xlabel('Count',size=20)
fig.set_size_inches(12, 8)
plt.title("Top 20 American 5 star rated Restaurant",fontsize=24);
ax.grid(axis = 'x', color ='green', linestyle='-')
ax.tick_params(axis='both', which='both',length=0)
sns.despine(left=True, bottom=True)

png

High-count Top 10 users who reviewed Restaurant


#get top 10 most reviewing users
n_top =10
most_review_user = user_df_filter_df.nlargest(n_top, 'review_count_y').reindex()
fig, ax = plt.subplots()
ax = sns.barplot(y="user_id", x="review_count_y", data=most_review_user)
ax.set_xlabel('Review Count',size=20)
fig.set_size_inches(12, 8)
plt.title("High count Top 10 users who reviewed Restaurant ",fontsize=24);
ax.grid(axis = 'x', color ='green', linestyle='-')
ax.tick_params(axis='both', which='both',length=0)
sns.despine(left=True, bottom=True)

png

Distribution of Review Count given by users and given to Restaurant

We can see that most review count is with less number of users and restaurants

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
user_df_filter_df.review_count_y.hist(bins=400,ax=ax[0],color = 'g')
#plt.xlim([0,1000])
ax[0].legend();
ax[0].set_xlim([0,1000])
ax[0].set_ylabel('Review Count',size=20)
ax[0].set_xlabel('Num of Users',size=20)
ax[0].set_title('Distribution of Review Count given by User',size=20);

restaurant_df.review_count.hist(bins=400,ax=ax[1],color = 'g')
ax[1].set_xlim([0,300])
ax[1].legend();
ax[1].set_ylabel('Review Count',size=20)
ax[1].set_xlabel('Num of Restaurant',size=20)
ax[1].set_title('Distribution of Review Count given to Restaurant',size=20);

png