Since the dataset is large, we load the data line by line (capping each file at 100,000 records) into the dataframes business_df, review_df, and user_df.
import json
import numpy as np
import pandas as pd

def readjson(filepath, max_records=100000):
    """Read up to max_records JSON-lines records from filepath into a DataFrame."""
    data = []
    with open(filepath, encoding="utf8") as f:
        for i, line in enumerate(f):
            if i >= max_records:
                break
            data.append(json.loads(line))
    return pd.DataFrame(data)
business_df = readjson('./dataset/business.json')
review_df = readjson('./dataset/review.json')
user_df = readjson('./dataset/user.json')
Extracting restaurants from the business dataframe based on the 'Food' category.
business_df['categories'] = business_df['categories'].astype(str)
restaurant_df = business_df[business_df['categories'].str.contains('Food')==True]
complete_df = restaurant_df.merge(review_df,on='business_id').merge(user_df,on='user_id')
complete_df.head(2)
address | attributes | business_id | categories | city | hours | is_open | latitude | longitude | name_x | neighborhood | postal_code | review_count_x | stars_x | state | cool_x | date | funny_x | review_id | stars_y | text | useful_x | user_id | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool_y | elite | fans | friends | funny_y | name_y | review_count_y | useful_y | yelping_since | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 0 | 2016-09-16 | 1 | ZH8g_PoY0Tr3YdQ-RGySrA | 5 | Great place. There was a man here who was very... | 1 | EDe16577dBImA1ypOzPlKg | 5.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [] | 0 | [] | 0 | Jessica | 1 | 0 | 2014-07-26 |
1 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 1 | 2014-11-13 | 1 | 6r2uAJE1dqUq1IHn_3R3qA | 4 | HOT HOT HOT! Real Mexican Food\n\nNO fake wate... | 2 | twx2ZgFUbat87vGQ_tFbPA | 3.55 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 5 | 0 | 0 | 11 | [] | 3 | [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... | 29 | Edwin | 94 | 317 | 2010-12-30 |
restaurant_df.describe()
is_open | latitude | longitude | review_count | stars | |
---|---|---|---|---|---|
count | 18503.00000 | 18503.000000 | 18503.000000 | 18503.000000 | 18503.000000 |
mean | 0.83073 | 39.702568 | -87.807760 | 34.804464 | 3.546857 |
std | 0.37500 | 5.747548 | 27.691971 | 82.946472 | 0.889710 |
min | 0.00000 | -34.520401 | -119.551325 | 3.000000 | 1.000000 |
25% | 1.00000 | 35.135615 | -112.013439 | 5.000000 | 3.000000 |
50% | 1.00000 | 40.440368 | -81.357777 | 11.000000 | 3.500000 |
75% | 1.00000 | 43.665419 | -79.414244 | 31.000000 | 4.000000 |
max | 1.00000 | 59.438181 | 11.769500 | 3439.000000 | 5.000000 |
user_df.describe()
average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool | fans | funny | review_count | useful | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 | 100000.000000 |
mean | 3.729684 | 16.342210 | 0.950070 | 16.342210 | 12.015470 | 0.416970 | 1.465460 | 6.980040 | 5.491070 | 15.870480 | 1.046280 | 6.151540 | 91.215580 | 5.103230 | 64.731610 | 66.524450 | 120.838970 |
std | 0.835715 | 197.424646 | 16.639768 | 197.424646 | 175.458886 | 7.165452 | 15.762362 | 70.410324 | 153.225409 | 194.113025 | 19.474635 | 73.883346 | 1509.129416 | 29.803631 | 1049.502721 | 178.975429 | 1610.123217 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 3.350000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 0.000000 |
50% | 3.810000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 16.000000 | 2.000000 |
75% | 4.240000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 50.000000 | 13.000000 |
max | 5.000000 | 16710.000000 | 2146.000000 | 16710.000000 | 19988.000000 | 1265.000000 | 1576.000000 | 6340.000000 | 33297.000000 | 13075.000000 | 2232.000000 | 7117.000000 | 175230.000000 | 1837.000000 | 103514.000000 | 11065.000000 | 187179.000000 |
review_df.describe()
cool | funny | stars | useful | |
---|---|---|---|---|
count | 100000.000000 | 100000.000000 | 100000.000000 | 100000.00000 |
mean | 0.532470 | 0.411740 | 3.730530 | 1.01213 |
std | 1.992121 | 1.655608 | 1.418456 | 2.46252 |
min | 0.000000 | 0.000000 | 1.000000 | 0.00000 |
25% | 0.000000 | 0.000000 | 3.000000 | 0.00000 |
50% | 0.000000 | 0.000000 | 4.000000 | 0.00000 |
75% | 0.000000 | 0.000000 | 5.000000 | 1.00000 |
max | 104.000000 | 114.000000 | 5.000000 | 113.00000 |
review_df.head(2)
business_id | cool | date | funny | review_id | stars | text | useful | user_id | |
---|---|---|---|---|---|---|---|---|---|
0 | uYHaNptLzDLoV_JZ_MuzUA | 0 | 2016-07-12 | 0 | VfBHSwC5Vz_pbFluy07i9Q | 5 | My girlfriend and I stayed here for 3 nights a... | 0 | cjpdDjZyprfyDG3RlkVG3w |
1 | uYHaNptLzDLoV_JZ_MuzUA | 0 | 2016-10-02 | 0 | 3zRpneRKDsOPq92tq7ybAA | 3 | If you need an inexpensive place to stay for a... | 0 | bjTcT8Ty4cJZhEOEo01FGA |
complete_df.head(2)
address | attributes | business_id | categories | city | hours | is_open | latitude | longitude | name_x | neighborhood | postal_code | review_count_x | stars_x | state | cool_x | date | funny_x | review_id | stars_y | text | useful_x | user_id | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool_y | elite | fans | friends | funny_y | name_y | review_count_y | useful_y | yelping_since | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 0 | 2016-09-16 | 1 | ZH8g_PoY0Tr3YdQ-RGySrA | 5 | Great place. There was a man here who was very... | 1 | EDe16577dBImA1ypOzPlKg | 5.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [] | 0 | [] | 0 | Jessica | 1 | 0 | 2014-07-26 |
1 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 1 | 2014-11-13 | 1 | 6r2uAJE1dqUq1IHn_3R3qA | 4 | HOT HOT HOT! Real Mexican Food\n\nNO fake wate... | 2 | twx2ZgFUbat87vGQ_tFbPA | 3.55 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 5 | 0 | 0 | 11 | [] | 3 | [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... | 29 | Edwin | 94 | 317 | 2010-12-30 |
Keeping only user_id, business_id, and stars_y, we use the Surprise library (https://pypi.python.org/pypi/scikit-surprise) to predict the baseline estimate for a given user and item.
from IPython.display import display, Math
display(Math(r'\hat{r}_{ui} = b_{ui} = \mu + b_u + b_i'))
baseline_df = complete_df[['user_id','business_id','stars_y']]
from surprise import SVD, BaselineOnly, Dataset, KNNBaseline, Reader
from surprise import evaluate, print_perf
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(baseline_df,reader)
data.split(n_folds=3)
We used the Surprise library for the baseline models. Surprise is a Python scikit for building and analyzing (collaborative-filtering) recommender systems. Various algorithms are built in, with a focus on rating prediction. BaselineOnly predicts the baseline estimate for a given user and item, r̂_ui = μ + b_u + b_i, where the unknown parameters b_u and b_i are the deviations, or biases, of user u and item i from the global mean μ.
KNNBaseline is a basic collaborative filtering algorithm that takes a baseline rating into account.
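For intuition, the baseline estimate is just the global mean shifted by the two bias terms. A minimal sketch with hypothetical numbers (not fitted from the data):
# Illustrative only: made-up bias values, not estimated from the dataset.
mu = 3.7      # global mean rating
b_u = 0.3     # user u rates 0.3 stars above average
b_i = -0.5    # restaurant i is rated 0.5 stars below average
r_hat_ui = mu + b_u + b_i
print(r_hat_ui)  # 3.5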
algo = BaselineOnly()
perf_baseline = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf_baseline)
Evaluating RMSE, MAE of algorithm BaselineOnly.
------------
Fold 1
Estimating biases using als...
RMSE: 1.2452
MAE: 1.0188
------------
Fold 2
Estimating biases using als...
RMSE: 1.2428
MAE: 1.0015
------------
Fold 3
Estimating biases using als...
RMSE: 1.2510
MAE: 1.0192
------------
------------
Mean RMSE: 1.2463
Mean MAE : 1.0132
------------
------------
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2452 1.2428 1.2510 1.2463
MAE 1.0188 1.0015 1.0192 1.0132
KNN baseline model based on the user-restaurant ratings:
display(Math(r'\hat{r}_{ui} = b_{ui} + \frac{\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v) \cdot (r_{vi} - b_{vi})}{\sum\limits_{v \in N^k_i(u)} \text{sim}(u, v)}'))
algo = KNNBaseline()
perf_knn_baseline = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf_knn_baseline)
Evaluating RMSE, MAE of algorithm KNNBaseline.
------------
Fold 1
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2509
MAE: 1.0222
------------
Fold 2
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2517
MAE: 1.0088
------------
Fold 3
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.2550
MAE: 1.0203
------------
------------
Mean RMSE: 1.2525
Mean MAE : 1.0171
------------
------------
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2509 1.2517 1.2550 1.2525
MAE 1.0222 1.0088 1.0203 1.0171
We used collaborative filtering. The two primary approaches to collaborative filtering are neighborhood methods and latent factor models.
Neighborhood methods are centered on computing the relationships between items or, alternatively, between users. The item-oriented approach evaluates a user's preference for an item based on that user's ratings of "neighboring" items. An item's neighbors are other items that tend to receive similar ratings when rated by the same users.
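A minimal sketch of the neighborhood idea on a toy user-restaurant matrix (hypothetical ratings, 0 means "not rated"); the pipeline below builds the real matrices from the merged dataframe:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy matrix: 3 users x 4 restaurants, 0 means "not rated".
toy_ratings = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 5, 4],
])

# Item-item similarity: restaurants rated similarly by the same users
# get a high cosine similarity (columns are restaurants).
item_sim = cosine_similarity(toy_ratings.T)
print(np.round(item_sim, 2))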
n_users = complete_df['user_id'].nunique()
n_restaurants = complete_df['business_id'].nunique()
print('Number of Unique Users: ', n_users)
print('Number of Restaurants: ', n_restaurants)
Number of Unique Users: 11749
Number of Restaurants: 482
Encoding user_id and business_id as nominal (integer) identifiers
unique_user_id = pd.DataFrame(complete_df['user_id'].unique(),columns =['user_id']).reset_index()
unique_user_id['new_user_id'] =unique_user_id['index']
del unique_user_id['index']
unique_business_id = pd.DataFrame(complete_df['business_id'].unique(),columns =['business_id']).reset_index()
unique_business_id['new_business_id'] =unique_business_id['index']
del unique_business_id['index']
new_complete_df = complete_df.merge(unique_user_id,on='user_id',how ='left')
new_complete_df = new_complete_df.merge(unique_business_id,on='business_id',how ='left')
new_complete_df.head(2)
address | attributes | business_id | categories | city | hours | is_open | latitude | longitude | name_x | neighborhood | postal_code | review_count_x | stars_x | state | cool_x | date | funny_x | review_id | stars_y | text | useful_x | user_id | average_stars | compliment_cool | compliment_cute | compliment_funny | compliment_hot | compliment_list | compliment_more | compliment_note | compliment_photos | compliment_plain | compliment_profile | compliment_writer | cool_y | elite | fans | friends | funny_y | name_y | review_count_y | useful_y | yelping_since | new_user_id | new_business_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 0 | 2016-09-16 | 1 | ZH8g_PoY0Tr3YdQ-RGySrA | 5 | Great place. There was a man here who was very... | 1 | EDe16577dBImA1ypOzPlKg | 5.00 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [] | 0 | [] | 0 | Jessica | 1 | 0 | 2014-07-26 | 0 | 0 |
1 | 1203 E Charleston Blvd, Ste 140 | {'RestaurantsTableService': True, 'GoodForMeal... | YTqtM2WFhcMZGeAGA08Cfg | ['Seafood', 'Restaurants', 'Specialty Food', '... | Las Vegas | {'Monday': '10:30-21:00', 'Tuesday': '10:30-21... | 1 | 36.159363 | -115.135949 | Mariscos Playa Escondida | Downtown | 89104 | 330 | 4.5 | NV | 1 | 2014-11-13 | 1 | 6r2uAJE1dqUq1IHn_3R3qA | 4 | HOT HOT HOT! Real Mexican Food\n\nNO fake wate... | 2 | twx2ZgFUbat87vGQ_tFbPA | 3.55 | 0 | 0 | 0 | 2 | 0 | 1 | 3 | 0 | 5 | 0 | 0 | 11 | [] | 3 | [eFObFWgDiQJwUiy9WlhOfg, W4KL3Q_AVGfRrWcwR60gK... | 29 | Edwin | 94 | 317 | 2010-12-30 | 1 | 0 |
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(new_complete_df, test_size=0.25)

# Creating two user x restaurant matrices, one for training and one for testing.
# In each row tuple, positions 45, 46, and 20 hold new_user_id, new_business_id,
# and the review rating (stars_y).
train_data_matrix = np.zeros((n_users, n_restaurants))
for row in train_data.itertuples():
    train_data_matrix[row[45]-1, row[46]-1] = row[20]

test_data_matrix = np.zeros((n_users, n_restaurants))
for row in test_data.itertuples():
    test_data_matrix[row[45]-1, row[46]-1] = row[20]
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
restaurant_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
def predict_rating(num_rating, sim, type='user'):
    if type == 'user':
        # Mean-centre each user's ratings, then add a similarity-weighted
        # average of the other users' deviations from their own means.
        user_rating_avg = num_rating.mean(axis=1)
        ratings_difference = num_rating - user_rating_avg[:, np.newaxis]
        prediction = user_rating_avg[:, np.newaxis] + \
            sim.dot(ratings_difference) / np.array([np.abs(sim).sum(axis=1)]).T
    elif type == 'restaurant':
        # Similarity-weighted average over neighbouring restaurants.
        prediction = num_rating.dot(sim) / np.array([np.abs(sim).sum(axis=1)])
    return prediction
restaurant_prediction = predict_rating(train_data_matrix, restaurant_similarity, type='restaurant')
user_prediction = predict_rating(train_data_matrix, user_similarity, type='user')
restaurant_prediction_test = predict_rating(test_data_matrix, restaurant_similarity, type='restaurant')
user_prediction_test = predict_rating(test_data_matrix, user_similarity, type='user')
model_memory_based_pred_res = restaurant_prediction
model_memory_based_pred_user = user_prediction
model_memory_based_pred_res_test = restaurant_prediction_test
model_memory_based_pred_user_test = user_prediction_test
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, true_value):
    # Evaluate only on cells that actually contain a rating (non-zero entries).
    prediction = prediction[true_value.nonzero()].flatten()
    true_value = true_value[true_value.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, true_value))
print('RMSE for training User based Collaborative filtering:', (rmse(user_prediction, train_data_matrix)))
print('RMSE for training Restaurant based Collaborative filtering: ', (rmse(restaurant_prediction, train_data_matrix)))
print('RMSE for testing User based Collaborative filtering:', (rmse(user_prediction_test, test_data_matrix)))
print('RMSE for testing Restaurant based Collaborative filtering: ', (rmse(restaurant_prediction_test, test_data_matrix)))
RMSE for training User based Collaborative filtering: 3.920373589539869
RMSE for training Restaurant based Collaborative filtering: 3.924467145133183
RMSE for testing User based Collaborative filtering: 4.9896265560165975
RMSE for testing Restaurant based Collaborative filtering: 5.0
Latent factor models (e.g., SVD-based matrix factorization) are an alternative approach that tries to explain the ratings by characterizing both items and users on a number of factors inferred from the rating patterns. They are based on matrix factorization, which characterizes both items and users by vectors of factors inferred from the item rating patterns; high correspondence between item and user factors leads to a recommendation. From the results below, the SVD-based factorization achieves a lower training RMSE than the memory-based neighborhood approaches.
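To make the factorization concrete, here is a minimal sketch on a toy matrix (hypothetical ratings): a rank-k SVD yields user factors, singular values, and item factors, and their product is the low-rank approximation whose entries serve as predicted ratings.
import numpy as np
from scipy.sparse.linalg import svds

# Toy 4 x 5 ratings matrix (hypothetical values, 0 = not rated).
R = np.array([
    [5., 4., 0., 1., 0.],
    [4., 0., 0., 1., 1.],
    [1., 1., 5., 4., 0.],
    [0., 1., 5., 4., 5.],
])

# Rank-2 factorization: u_toy is 4x2, s_toy holds 2 singular values, vt_toy is 2x5.
u_toy, s_toy, vt_toy = svds(R, k=2)

# u . diag(s) . vt is the rank-2 approximation; its values in the zero
# (unrated) cells act as predicted ratings.
R_approx = np.dot(np.dot(u_toy, np.diag(s_toy)), vt_toy)
print(np.round(R_approx, 2))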
# Using scipy's sparse SVD on the real matrices
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Get SVD components from the train matrix; k is the number of latent factors.
u, s, vt = svds(train_data_matrix, k=10)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Test-side reconstruction; note that it reuses the training singular values
# (s_diag_matrix) and training item factors (vt) together with u_test.
u_test, s_test, vt_test = svds(test_data_matrix, k=10)
X_pred_test = np.dot(np.dot(u_test, s_diag_matrix), vt)
print('RMSE for training User based SVD Collaborative filtering: ', (rmse(X_pred, train_data_matrix)))
print('RMSE for testing User based SVD Collaborative filtering: ', (rmse(X_pred_test, test_data_matrix)))
RMSE for training User based SVD Collaborative filtering: 3.3716947413739393
RMSE for testing User based SVD Collaborative filtering: 4.999999999650054
We combined the individual predictions of the models above (neighborhood-based and SVD) to classify new examples; such an ensemble should improve predictive accuracy. Each individual model has only mediocre accuracy, so we want to give more weight to the more accurate models and less to the weaker ones. To do this in Python, one can use the predicted values as the predictors in a logistic regression model, with the corresponding observed ratings y as the response. Logistic regression takes the "importance" of each model into account: the predictors (models) that do well most of the time end up with the more significant coefficients.
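Schematically, the stacking step looks like this; a minimal sketch with hypothetical base-model predictions and ratings (the actual code below stacks the flattened memory-based and SVD predictions):
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

# Hypothetical predictions from three base models for nine user-restaurant pairs.
base_preds = np.array([
    [1.1, 1.5, 1.0], [1.3, 1.2, 1.4], [0.9, 1.4, 1.1],
    [3.2, 3.0, 3.4], [2.9, 3.1, 3.0], [3.1, 2.8, 3.3],
    [4.8, 4.5, 4.9], [5.0, 4.7, 4.8], [4.6, 4.9, 5.0],
])
true_stars = np.array([1, 1, 1, 3, 3, 3, 5, 5, 5])  # observed ratings (classes)

# The meta-classifier learns how much weight to give each base model.
meta = LogisticRegressionCV(cv=3)
meta.fit(base_preds, true_stars)
print(meta.predict(base_preds))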
model_svd_based_pred = X_pred
model_svd_based_pred_test = X_pred_test
model_memory_based_pred_res_flat = model_memory_based_pred_res.ravel()
model_memory_based_pred_user_flat = model_memory_based_pred_user.ravel()
model_svd_based_pred_flat = model_svd_based_pred.ravel()
model_memory_based_pred_res_test_flat = model_memory_based_pred_res_test.ravel()
model_memory_based_pred_user_test_flat = model_memory_based_pred_user_test.ravel()
model_svd_based_pred_test_flat = model_svd_based_pred_test.ravel()
pred_model_array_train = np.zeros((model_memory_based_pred_res_flat.size,3))
pred_model_array_test = np.zeros((model_memory_based_pred_res_test_flat.size,3))
pred_model_array_train[:,0] = model_memory_based_pred_res_flat
pred_model_array_train[:,1] = model_memory_based_pred_user_flat
pred_model_array_train[:,2] = model_svd_based_pred_flat
pred_model_array_test[:,0] = model_memory_based_pred_res_test_flat
pred_model_array_test[:,1] = model_memory_based_pred_user_test_flat
pred_model_array_test[:,2] = model_svd_based_pred_test_flat
y_train_data_matrix_flat = train_data_matrix.ravel()
y_test_data_matrix_flat = test_data_matrix.ravel()
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegressionCV

def rmse_new(prediction, true_value):
    # RMSE over all cells, without masking unrated entries.
    return sqrt(mean_squared_error(prediction, true_value))

logreg = LogisticRegressionCV()
logreg.fit(pred_model_array_train[0:100000], y_train_data_matrix_flat[0:100000])
y_hat_train = logreg.predict(pred_model_array_train)
y_hat_test = logreg.predict(pred_model_array_test)
print("Test LogReg RMSE: ", rmse_new(y_test_data_matrix_flat, y_hat_test))
print("Train LogReg RMSE: ", rmse_new(y_train_data_matrix_flat, y_hat_train))
Test LogReg RMSE: 0.3965122913294842
Train LogReg RMSE: 0.16192899952388481
print_perf(perf_baseline)
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2452 1.2428 1.2510 1.2463
MAE 1.0188 1.0015 1.0192 1.0132
print_perf(perf_knn_baseline)
Fold 1 Fold 2 Fold 3 Mean
RMSE 1.2509 1.2517 1.2550 1.2525
MAE 1.0222 1.0088 1.0203 1.0171
meta_clf_scores_tr = rmse_new(y_train_data_matrix_flat, y_hat_train)
SVD_cf_scores_tr = rmse(X_pred, train_data_matrix)
memory_user_based_cf_scores_tr = rmse(user_prediction, train_data_matrix)
memory_restaurant_based_cf_scores_tr = rmse(restaurant_prediction, train_data_matrix)
meta_clf_scores_ts = rmse_new(y_test_data_matrix_flat, y_hat_test)
SVD_cf_scores_ts = rmse(X_pred_test, test_data_matrix)
memory_user_based_cf_scores_ts = rmse(user_prediction_test, test_data_matrix)
memory_restaurant_based_cf_scores_ts = rmse(restaurant_prediction_test, test_data_matrix)
score = [meta_clf_scores_tr,SVD_cf_scores_tr,memory_user_based_cf_scores_tr,memory_restaurant_based_cf_scores_tr,
meta_clf_scores_ts,SVD_cf_scores_ts,memory_user_based_cf_scores_ts,memory_restaurant_based_cf_scores_ts]
pd.DataFrame(np.array(score).reshape(2,4), columns = ['Meta Classifier','SVD Collaborative Filtering','Memory Based User Collaborative Filtering',
'Memory Based Restaurant Collaborative Filtering'], index = ['RMSE in Training','RMSE in Testing'])
Meta Classifier | SVD Collaborative Filtering | Memory Based User Collaborative Filtering | Memory Based Restaurant Collaborative Filtering | |
---|---|---|---|---|
RMSE in Training | 0.161929 | 3.371695 | 3.920374 | 3.924467 |
RMSE in Testing | 0.396512 | 5.000000 | 4.989627 | 5.000000 |
From the table above, the meta-classifier achieves a substantially lower RMSE than the individual collaborative filtering models on both the training and test data.