Since the dataset is large, we load the data line by line (capping each file at 100,000 records) into the dataframes business_df, review_df, and user_df.
import json
import numpy as np
import pandas as pd

def readjson(filepath, max_records=100000):
    """Read up to max_records JSON-lines records from filepath into a DataFrame."""
    data = []
    with open(filepath, encoding="utf8") as f:
        for i, line in enumerate(f):
            if i >= max_records:
                break
            data.append(json.loads(line))
    return pd.DataFrame(data)
business_df = readjson('./dataset/business.json')
review_df = readjson('./dataset/review.json')
user_df = readjson('./dataset/user.json')
Extracting restaurants from the business dataframe based on the 'Food' category.
business_df['categories'] = business_df['categories'].astype(str)
restaurant_df = business_df[business_df['categories'].str.contains('Food')==True]
complete_df = restaurant_df.merge(review_df,on='business_id').merge(user_df,on='user_id')
complete_df.head(2)
address | attributes | business_id | categories | city | hours | is_open | latitude | longitude | name_x | neighborhood | postal_code | review_count_x | stars_x | state | cool_x | date | funny_x | review_id | stars_y | text | useful_x | user_id | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool_y | elite | fans | friends | funny_y | name_y | review_count_y | useful_y | yelping_since | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 0 | 2016-09-16 | 1 | ZH8g_PoY0Tr3YdQ-RGySrA | 5 | Great place. There was a man here who was very... | 1 | EDe16577dBImA1ypOzPlKg | 5.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [] | 0 | [] | 0 | Jessica | 1 | 0 | 2014-07-26 |
1 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 1 | 2014-11-13 | 1 | 6r2uAJE1dqUq1IHn_3R3qA | 4 | HOT HOT HOT! Real Mexican Food\n\nNO fake wate... | 2 | twx2ZgFUbat87vGQ_tFbPA | 3.55 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 5 | 0 | 0 | 11 | [] | 3 | [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... | 29 | Edwin | 94 | 317 | 2010-12-30 |
restaurant_df.describe()
is_open | latitude | longitude | review_count | stars | |
---|---|---|---|---|---|
count | 18503.00000 | 18503.000000 | 18503.000000 | 18503.000000 | 18503.000000 |
mean | 0.83073 | 39.702568 | -87.807760 | 34.804464 | 3.546857 |
std | 0.37500 | 5.747548 | 27.691971 | 82.946472 | 0.889710 |
min | 0.00000 | -34.520401 | -119.551325 | 3.000000 | 1.000000 |
25% | 1.00000 | 35.135615 | -112.013439 | 5.000000 | 3.000000 |
50% | 1.00000 | 40.440368 | -81.357777 | 11.000000 | 3.500000 |
75% | 1.00000 | 43.665419 | -79.414244 | 31.000000 | 4.000000 |
max | 1.00000 | 59.438181 | 11.769500 | 3439.000000 | 5.000000 |
user_df.describe()
average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool | fans | funny | review_count | useful | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 |
mean | 3.729684 | 16.342210 | 0.950070 | 16.342210 | 12.015470 | 0.416970 | 1.465460 | 6.980040 | 5.491070 | 15.870480 | 1.046280 | 6.151540 | 91.215580 | 5.103230 | 64.731610 | 66.524450 | 120.838970 |
std | 0.835715 | 197.424646 | 16.639768 | 197.424646 | 175.458886 | 7.165452 | 15.762362 | 70.410324 | 153.225409 | 194.113025 | 19.474635 | 73.883346 | 1509.129416 | 29.803631 | 1049.502721 | 178.975429 | 1610.123217 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 3.350000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 0.000000 |
50% | 3.810000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 16.000000 | 2.000000 |
75% | 4.240000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 50.000000 | 13.000000 |
max | 5.000000 | 16710.000000 | 2146.000000 | 16710.000000 | 19988.000000 | 1265.000000 | 1576.000000 | 6340.000000 | 33297.000000 | 13075.000000 | 2232.000000 | 7117.000000 | 175230.000000 | 1837.000000 | 103514.000000 | 11065.000000 | 187179.000000 |
review_df.describe()
cool | funny | stars | useful | |
---|---|---|---|---|
count | 100000.000000 | 100000.000000 | 100000.000000 | 100000.00000 |
mean | 0.532470 | 0.411740 | 3.730530 | 1.01213 |
std | 1.992121 | 1.655608 | 1.418456 | 2.46252 |
min | 0.000000 | 0.000000 | 1.000000 | 0.00000 |
25% | 0.000000 | 0.000000 | 3.000000 | 0.00000 |
50% | 0.000000 | 0.000000 | 4.000000 | 0.00000 |
75% | 0.000000 | 0.000000 | 5.000000 | 1.00000 |
max | 104.000000 | 114.000000 | 5.000000 | 113.00000 |
review_df.head(2)
business_id | cool | date | funny | review_id | stars | text | useful | user_id | |
---|---|---|---|---|---|---|---|---|---|
0 | uYHaNptLzDLoV_JZ_MuzUA | 0 | 2016-07-12 | 0 | VfBHSwC5Vz_pbFluy07i9Q | 5 | My girlfriend and I stayed here for 3 nights a... | 0 | cjpdDjZyprfyDG3RlkVG3w |
1 | uYHaNptLzDLoV_JZ_MuzUA | 0 | 2016-10-02 | 0 | 3zRpneRKDsOPq92tq7ybAA | 3 | If you need an inexpensive place to stay for a... | 0 | bjTcT8Ty4cJZhEOEo01FGA |
complete_df.head(2)
address | attributes | business_id | categories | city | hours | is_open | latitude | longitude | name_x | neighborhood | postal_code | review_count_x | stars_x | state | cool_x | date | funny_x | review_id | stars_y | text | useful_x | user_id | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool_y | elite | fans | friends | funny_y | name_y | review_count_y | useful_y | yelping_since | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 0 | 2016-09-16 | 1 | ZH8g_PoY0Tr3YdQ-RGySrA | 5 | Great place. There was a man here who was very... | 1 | EDe16577dBImA1ypOzPlKg | 5.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [] | 0 | [] | 0 | Jessica | 1 | 0 | 2014-07-26 |
1 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 1 | 2014-11-13 | 1 | 6r2uAJE1dqUq1IHn_3R3qA | 4 | HOT HOT HOT! Real Mexican Food\n\nNO fake wate... | 2 | twx2ZgFUbat87vGQ_tFbPA | 3.55 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 5 | 0 | 0 | 11 | [] | 3 | [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... | 29 | Edwin | 94 | 317 | 2010-12-30 |
Keeping only user_id, business_id, and stars_y, we use the Surprise library (https://pypi.python.org/pypi/scikit-surprise) to predict the baseline estimate for a given user and item.
from IPython.display import display, Math
display(Math(r'\hat{r}_{ui} = b_{ui} = \mu + b_u + b_i'))
baseline_df = complete_df[['user_id','business_id','stars_y']]
from surprise import SVD, BaselineOnly, Dataset, KNNBaseline, Reader
from surprise import evaluate, print_perf
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(baseline_df,reader)
data.split(n_folds=3)
We used the Surprise library for the baseline models. Surprise is a Python scikit for building and analyzing (collaborative-filtering) recommender systems. Various algorithms are built in, with a focus on rating prediction. BaselineOnly predicts the baseline estimate for a given user and item, r̂_ui = μ + b_u + b_i, where the unknown parameters b_u and b_i are the deviations, or biases, of user u and item i from the global mean μ.
KNNBaseline is a basic collaborative filtering algorithm that takes a baseline rating into account.
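For intuition, the baseline estimate is just the global mean shifted by the two bias terms. A minimal sketch with hypothetical numbers (not fitted from the data):
# Illustrative only: made-up bias values, not estimated from the dataset.
mu = 3.7      # global mean rating
b_u = 0.3     # user u rates 0.3 stars above average
b_i = -0.5    # restaurant i is rated 0.5 stars below average
r_hat_ui = mu + b_u + b_i
print(r_hat_ui)  # 3.5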
algo = BaselineOnly()
perf_baseline = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf_baseline)
Evaluating RMSE, MAE of algorithm BaselineOnly.
------------
Fold 1
Estimating biases using als...
RMSE: 1.2452
MAE: 1.0188
------------
Fold 2
Estimating biases using als...
RMSE: 1.2428
MAE: 1.0015
------------
Fold 3
Estimating biases using als...
RMSE: 1.2510
MAE: 1.0192
------------
------------
Mean RMSE: 1.2463
Mean MAE : 1.0132
------------
------------
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2452 1.2428 1.2510 1.2463
MAE 1.0188 1.0015 1.0192 1.0132
KNN baseline model based on the user-restaurant ratings:
display(Math(r'\hat{r}_{ui} = b_{ui} + \frac{\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v) \cdot (r_{vi} - b_{vi})}{\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v)}'))
algo = KNNBaseline()
perf_knn_baseline = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf_knn_baseline)
Evaluating RMSE, MAE of algorithm KNNBaseline.
------------
Fold 1
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2509
MAE: 1.0222
------------
Fold 2
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2517
MAE: 1.0088
------------
Fold 3
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2550
MAE: 1.0203
------------
------------
Mean RMSE: 1.2525
Mean MAE : 1.0171
------------
------------
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2509 1.2517 1.2550 1.2525
MAE 1.0222 1.0088 1.0203 1.0171
We used collaborative filtering. The two primary approaches to collaborative filtering are neighborhood methods and latent factor models.
Neighborhood methods are centered on computing the relationships between items or, alternatively, between users. The item-oriented approach evaluates a user's preference for an item based on that user's ratings of "neighboring" items. An item's neighbors are other items that tend to receive similar ratings when rated by the same users.
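A minimal sketch of the neighborhood idea on a toy user-restaurant matrix (hypothetical ratings, 0 means "not rated"); the pipeline below builds the real matrices from the merged dataframe:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy matrix: 3 users x 4 restaurants, 0 means "not rated".
toy_ratings = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 5, 4],
])

# Item-item similarity: restaurants rated similarly by the same users
# get a high cosine similarity (columns are restaurants).
item_sim = cosine_similarity(toy_ratings.T)
print(np.round(item_sim, 2))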
n_users = complete_df['user_id'].nunique()
n_restaurants = complete_df['business_id'].nunique()
print('Number of Unique Users: ', n_users)
print('Number of Restaurants: ', n_restaurants)
Number of Unique Users: 11749
Number of Restaurants: 482
Encoding user_id and business_id as nominal (integer) identifiers
unique_user_id = pd.DataFrame(complete_df['user_id'].unique(),columns =['user_id']).reset_index()
unique_user_id['new_user_id'] =unique_user_id['index']
del unique_user_id['index']
unique_business_id = pd.DataFrame(complete_df['business_id'].unique(),columns =['business_id']).reset_index()
unique_business_id['new_business_id'] =unique_business_id['index']
del unique_business_id['index']
new_complete_df = complete_df.merge(unique_user_id,on='user_id',how ='left')
new_complete_df = new_complete_df.merge(unique_business_id,on='business_id',how ='left')
new_complete_df.head(2)
address | attributes | business_id | categories | city | hours | is_open | latitude | longitude | name_x | neighborhood | postal_code | review_count_x | stars_x | state | cool_x | date | funny_x | review_id | stars_y | text | useful_x | user_id | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool_y | elite | fans | friends | funny_y | name_y | review_count_y | useful_y | yelping_since | new_user_id | new_business_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 0 | 2016-09-16 | 1 | ZH8g_PoY0Tr3YdQ-RGySrA | 5 | Great place. There was a man here who was very... | 1 | EDe16577dBImA1ypOzPlKg | 5.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [] | 0 | [] | 0 | Jessica | 1 | 0 | 2014-07-26 | 0 | 0 |
1 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 1 | 2014-11-13 | 1 | 6r2uAJE1dqUq1IHn_3R3qA | 4 | HOT HOT HOT! Real Mexican Food\n\nNO fake wate... | 2 | twx2ZgFUbat87vGQ_tFbPA | 3.55 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 5 | 0 | 0 | 11 | [] | 3 | [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... | 29 | Edwin | 94 | 317 | 2010-12-30 | 1 | 0 |
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(new_complete_df, test_size=0.25)

# Creating two user x restaurant matrices, one for training and one for testing.
# In each row tuple, positions 45, 46, and 20 hold new_user_id, new_business_id,
# and the review rating (stars_y).
train_data_matrix = np.zeros((n_users, n_restaurants))
for row in train_data.itertuples():
    train_data_matrix[row[45]-1, row[46]-1] = row[20]

test_data_matrix = np.zeros((n_users, n_restaurants))
for row in test_data.itertuples():
    test_data_matrix[row[45]-1, row[46]-1] = row[20]
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
restaurant_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
def predict_rating(num_rating, sim, type='user'):
    if type == 'user':
        # Mean-centre each user's ratings, then add a similarity-weighted
        # average of the other users' deviations from their own means.
        user_rating_avg = num_rating.mean(axis=1)
        ratings_difference = num_rating - user_rating_avg[:, np.newaxis]
        prediction = user_rating_avg[:, np.newaxis] + \
            sim.dot(ratings_difference) / np.array([np.abs(sim).sum(axis=1)]).T
    elif type == 'restaurant':
        # Similarity-weighted average over neighbouring restaurants.
        prediction = num_rating.dot(sim) / np.array([np.abs(sim).sum(axis=1)])
    return prediction
restaurant_prediction = predict_rating(train_data_matrix, restaurant_similarity, type='restaurant')
user_prediction = predict_rating(train_data_matrix, user_similarity, type='user')
restaurant_prediction_test = predict_rating(test_data_matrix, restaurant_similarity, type='restaurant')
user_prediction_test = predict_rating(test_data_matrix, user_similarity, type='user')
model_memory_based_pred_res = restaurant_prediction
model_memory_based_pred_user = user_prediction
model_memory_based_pred_res_test = restaurant_prediction_test
model_memory_based_pred_user_test = user_prediction_test
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, true_value):
    # Evaluate only on cells that actually contain a rating (non-zero entries).
    prediction = prediction[true_value.nonzero()].flatten()
    true_value = true_value[true_value.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, true_value))
print('RMSE for training User based Collaborative filtering:', (rmse(user_prediction, train_data_matrix)))
print('RMSE for training Restaurant based Collaborative filtering: ', (rmse(restaurant_prediction, train_data_matrix)))
print('RMSE for testing User based Collaborative filtering:', (rmse(user_prediction_test, test_data_matrix)))
print('RMSE for testing Restaurant based Collaborative filtering: ', (rmse(restaurant_prediction_test, test_data_matrix)))
RMSE for training User based Collaborative filtering: 3.920373589539869
RMSE for training Restaurant based Collaborative filtering: 3.924467145133183
RMSE for testing User based Collaborative filtering: 4.9896265560165975
RMSE for testing Restaurant based Collaborative filtering: 5.0
Latent factor models (e.g., SVD-based matrix factorization) are an alternative approach that tries to explain the ratings by characterizing both items and users on a number of factors inferred from the rating patterns. They are based on matrix factorization, which characterizes both items and users by vectors of factors inferred from the item rating patterns; high correspondence between item and user factors leads to a recommendation. From the results below, the SVD-based factorization achieves a lower training RMSE than the memory-based neighborhood approaches.
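To make the factorization concrete, here is a minimal sketch on a toy matrix (hypothetical ratings): a rank-k SVD yields user factors, singular values, and item factors, and their product is the low-rank approximation whose entries serve as predicted ratings.
import numpy as np
from scipy.sparse.linalg import svds

# Toy 4 x 5 ratings matrix (hypothetical values, 0 = not rated).
R = np.array([
    [5., 4., 0., 1., 0.],
    [4., 0., 0., 1., 1.],
    [1., 1., 5., 4., 0.],
    [0., 1., 5., 4., 5.],
])

# Rank-2 factorization: u_toy is 4x2, s_toy holds 2 singular values, vt_toy is 2x5.
u_toy, s_toy, vt_toy = svds(R, k=2)

# u . diag(s) . vt is the rank-2 approximation; its values in the zero
# (unrated) cells act as predicted ratings.
R_approx = np.dot(np.dot(u_toy, np.diag(s_toy)), vt_toy)
print(np.round(R_approx, 2))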
# Using scipy's sparse SVD on the real matrices
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Get SVD components from the train matrix; k is the number of latent factors.
u, s, vt = svds(train_data_matrix, k=10)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Test-side reconstruction; note that it reuses the training singular values
# (s_diag_matrix) and training item factors (vt) together with u_test.
u_test, s_test, vt_test = svds(test_data_matrix, k=10)
X_pred_test = np.dot(np.dot(u_test, s_diag_matrix), vt)
print('RMSE for training User based SVD Collaborative filtering: ', (rmse(X_pred, train_data_matrix)))
print('RMSE for testing User based SVD Collaborative filtering: ', (rmse(X_pred_test, test_data_matrix)))
RMSE for training User based SVD Collaborative filtering: 3.3716947413739393
RMSE for testing User based SVD Collaborative filtering: 4.999999999650054
We combined the individual predictions of the models above (neighborhood-based and SVD) to classify new examples; such an ensemble should improve predictive accuracy. Each individual model has only mediocre accuracy, so we want to give more weight to the more accurate models and less to the weaker ones. To do this in Python, one can use the predicted values as the predictors in a logistic regression model, with the corresponding observed ratings y as the response. Logistic regression takes the "importance" of each model into account: the predictors (models) that do well most of the time end up with the more significant coefficients.
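Schematically, the stacking step looks like this; a minimal sketch with hypothetical base-model predictions and ratings (the actual code below stacks the flattened memory-based and SVD predictions):
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

# Hypothetical predictions from three base models for nine user-restaurant pairs.
base_preds = np.array([
    [1.1, 1.5, 1.0], [1.3, 1.2, 1.4], [0.9, 1.4, 1.1],
    [3.2, 3.0, 3.4], [2.9, 3.1, 3.0], [3.1, 2.8, 3.3],
    [4.8, 4.5, 4.9], [5.0, 4.7, 4.8], [4.6, 4.9, 5.0],
])
true_stars = np.array([1, 1, 1, 3, 3, 3, 5, 5, 5])  # observed ratings (classes)

# The meta-classifier learns how much weight to give each base model.
meta = LogisticRegressionCV(cv=3)
meta.fit(base_preds, true_stars)
print(meta.predict(base_preds))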
model_svd_based_pred = X_pred
model_svd_based_pred_test = X_pred_test
model_memory_based_pred_res_flat = model_memory_based_pred_res.ravel()
model_memory_based_pred_user_flat = model_memory_based_pred_user.ravel()
model_svd_based_pred_flat = model_svd_based_pred.ravel()
model_memory_based_pred_res_test_flat = model_memory_based_pred_res_test.ravel()
model_memory_based_pred_user_test_flat = model_memory_based_pred_user_test.ravel()
model_svd_based_pred_test_flat = model_svd_based_pred_test.ravel()
pred_model_array_train = np.zeros((model_memory_based_pred_res_flat.size,3))
pred_model_array_test = np.zeros((model_memory_based_pred_res_test_flat.size,3))
pred_model_array_train[:,0] = model_memory_based_pred_res_flat
pred_model_array_train[:,1] = model_memory_based_pred_user_flat
pred_model_array_train[:,2] = model_svd_based_pred_flat
pred_model_array_test[:,0] = model_memory_based_pred_res_test_flat
pred_model_array_test[:,1] = model_memory_based_pred_user_test_flat
pred_model_array_test[:,2] = model_svd_based_pred_test_flat
y_train_data_matrix_flat = train_data_matrix.ravel()
y_test_data_matrix_flat = test_data_matrix.ravel()
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegressionCV

def rmse_new(prediction, true_value):
    # RMSE over all cells, without masking unrated entries.
    return sqrt(mean_squared_error(prediction, true_value))

logreg = LogisticRegressionCV()
logreg.fit(pred_model_array_train[0:100000], y_train_data_matrix_flat[0:100000])
y_hat_train = logreg.predict(pred_model_array_train)
y_hat_test = logreg.predict(pred_model_array_test)
print("Test LogReg RMSE: ", rmse_new(y_test_data_matrix_flat, y_hat_test))
print("Train LogReg RMSE: ", rmse_new(y_train_data_matrix_flat, y_hat_train))
Test LogReg RMSE: 0.3965122913294842
Train LogReg RMSE: 0.16192899952388481
print_perf(perf_baseline)
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2452 1.2428 1.2510 1.2463
MAE 1.0188 1.0015 1.0192 1.0132
print_perf(perf_knn_baseline)
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2509 1.2517 1.2550 1.2525
MAE 1.0222 1.0088 1.0203 1.0171
meta_clf_scores_tr = rmse_new(y_train_data_matrix_flat, y_hat_train)
SVD_cf_scores_tr = rmse(X_pred, train_data_matrix)
memory_user_based_cf_scores_tr = rmse(user_prediction, train_data_matrix)
memory_restaurant_based_cf_scores_tr = rmse(restaurant_prediction, train_data_matrix)
meta_clf_scores_ts = rmse_new(y_test_data_matrix_flat, y_hat_test)
SVD_cf_scores_ts = rmse(X_pred_test, test_data_matrix)
memory_user_based_cf_scores_ts = rmse(user_prediction_test, test_data_matrix)
memory_restaurant_based_cf_scores_ts = rmse(restaurant_prediction_test, test_data_matrix)
score = [meta_clf_scores_tr,SVD_cf_scores_tr,memory_user_based_cf_scores_tr,memory_restaurant_based_cf_scores_tr,
meta_clf_scores_ts,SVD_cf_scores_ts,memory_user_based_cf_scores_ts,memory_restaurant_based_cf_scores_ts]
pd.DataFrame(np.array(score).reshape(2,4), columns = ['Meta Classifier','SVD Collaborative Filtering','Memory Based User Collaborative Filtering',
'Memory Based Restaurant Collaborative Filtering'], index = ['RMSE in Training','RMSE in Testing'])
Meta Classifier | SVD Collaborative Filtering | Memory Based User Collaborative Filtering | Memory Based Restaurant Collaborative Filtering | |
---|---|---|---|---|
RMSE in Training | 0.161929 | 3.371695 | 3.920374 | 3.924467 |
RMSE in Testing | 0.396512 | 5.000000 | 4.989627 | 5.000000 |
From the table above, the meta-classifier achieves a substantially lower RMSE than the individual collaborative filtering models on both the training and test data.