-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
372 lines (300 loc) · 12.4 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
import time
import pandas as pd
import numpy as np
from neo4j import GraphDatabase
from flask import Flask, request
app = Flask(__name__)
@app.route('/')
def test():
    """Serve the input form for the recommendation demo.

    The form posts back to the same URL. It contains three review-style
    selectors (``cool``, ``funny``, ``useful``) and one star-rating selector
    per sample business; each business selector is named after the business
    id that the POST handler reads from ``request.form``.

    Returns:
        str: the HTML page.
    """
    # The markup is a plain (non-f) string: field names and option values are
    # read back verbatim by the POST handler, so they must not be altered.
    return """
<!DOCTYPE html>
<html lang = "en">
<head>
<title>recinpu.html</title>
<meta charset = "UTF-8" />
</head>
<body>
<h1>Get Yelp Recommendations!</h1>
<form method="POST">
<fieldset>
<legend>Your Review History(be honest!)</legend>
<p>
<label>How cool are your reviews?</label>
<select name = "cool">
<option value = "cool1">one</option>
<option value = "cool2">two</option>
<option value = "cool3">three</option>
<option value = "cool4">four</option>
</select>
<br>
<label>How funny are your reviews?</label>
<select name = "funny">
<option value = "funny1">one</option>
<option value = "funny2">two</option>
<option value = "funny3">three</option>
<option value = "funny4">four</option>
</select>
<br>
<label>How useful are your reviews?</label>
<select name = "useful">
<option value = "useful1">one</option>
<option value = "useful2">two</option>
<option value = "useful3">three</option>
<option value = "useful4">four</option>
<option value = "useful5">five</option>
<option value = "useful6">six</option>
<option value = "useful7">seven</option>
</select>
</p>
</fieldset>
<br>
<fieldset>
<legend>Rate some Businesses</legend>
<p>
<label>Yagyu Ramen</label>
<select name = "5yZ1XmDcOEsElDeb9PlPDQ">
<option value = "1">one star</option>
<option value = "2">two stars</option>
<option value = "3">three stars</option>
<option value = "4">four stars</option>
<option value = "5">five stars</option>
</select>
<br>
<label>KUMI by Chef Akira Back</label>
<select name = "PL3cimEUfNHlenOGSOAdJg">
<option value = "1">one star</option>
<option value = "2">two stars</option>
<option value = "3">three stars</option>
<option value = "4">four stars</option>
<option value = "5">five stars</option>
</select>
<br>
<label>Vince Neil's Tatuado | Eat, Drink, Party</label>
<select name = "4n81G-pmC3rfhmaPsbwYKg">
<option value = "1">one star</option>
<option value = "2">two stars</option>
<option value = "3">three stars</option>
<option value = "4">four stars</option>
<option value = "5">five stars</option>
</select>
<br>
<label>Mr G's Pub &amp; Grub</label>
<select name = "iwGhazq9eP51PSerTrMrwg">
<option value = "1">one star</option>
<option value = "2">two stars</option>
<option value = "3">three stars</option>
<option value = "4">four stars</option>
<option value = "5">five stars</option>
</select>
<br>
<label>RA Sushi Bar Restaurant</label>
<select name = "R3TC2oq8fQK9c9BNMZ-ynA">
<option value = "1">one star</option>
<option value = "2">two stars</option>
<option value = "3">three stars</option>
<option value = "4">four stars</option>
<option value = "5">five stars</option>
</select>
</p>
</fieldset>
<input type="submit" value="Submit">
</form>
</body>
</html>
"""
@app.route('/', methods=['POST'])
def page_input():
    """Handle the submitted form and return the recommendations page.

    Reads the three review-style selections and the five business star
    ratings from ``request.form``, predicts a rating for a random sample of
    candidate businesses with ``predict_rating``, and returns the five
    highest-scoring candidates as an HTML table whose cells link to the
    corresponding Yelp pages.

    Returns:
        str: an HTML table with columns ``Predicted Rating`` and
        ``Restaurant``.
    """
    # The prediction helpers accept a neo4j driver argument, but this demo
    # reads everything from local pickle files, so a placeholder is passed.
    # If the database changes or a new selection of businesses is needed,
    # reconnect with: GraphDatabase.driver(uri, auth=("neo4j", "password"))
    driver = 0

    # NOTE(review): pd.read_pickle unpickles arbitrary objects; these paths
    # are expected to be trusted files shipped with the app - confirm.
    biz_cats = pd.read_pickle("data/biz_cats")
    test_businesses = pd.read_pickle("data/test_businesses")

    # Score at most 30 random candidates; capping at the table size avoids
    # the ValueError sample() raises when asked for more rows than exist.
    sample_businesses = test_businesses.sample(min(30, len(test_businesses)))

    user_cat_ids = [
        request.form['cool'],
        request.form['funny'],
        request.form['useful'],
    ]

    rated_business_ids = [
        '5yZ1XmDcOEsElDeb9PlPDQ',
        'PL3cimEUfNHlenOGSOAdJg',
        '4n81G-pmC3rfhmaPsbwYKg',
        'iwGhazq9eP51PSerTrMrwg',
        'R3TC2oq8fQK9c9BNMZ-ynA',
    ]
    ratings = [[key, int(request.form[key])] for key in rated_business_ids]
    ratings_df = pd.DataFrame(ratings, columns=['b.id', 'r.stars'])

    # Attach whatever columns biz_cats carries for each rated business
    # (presumably the 'cats' column read by user_preference_demo - confirm).
    user_review_dist = ratings_df.merge(biz_cats, on='b.id')

    business_review_dist = pd.read_pickle('data/business_review_dist')
    biz_category_lookup = pd.read_pickle('data/biz_category_lookup')
    user_category_lookup = pd.read_pickle('data/user_category_lookup')

    # None means "no limit"; the former sentinel -1 is rejected by current
    # pandas. Without this the long anchor tags could be truncated when the
    # table is rendered.
    pd.set_option('display.max_colwidth', None)

    predicted_ratings = [
        (
            predict_rating(
                driver,
                user_cat_ids,
                user_review_dist,
                business_review_dist,
                biz_category_lookup,
                user_category_lookup,
                candidate_id,
            ),
            candidate_id,
        )
        for candidate_id in sample_businesses['b.id']
    ]

    recommendations = pd.DataFrame(
        predicted_ratings,
        columns=['Predicted Rating', 'Restaurant'],
    ).sort_values('Predicted Rating', ascending=False).head()

    # Turn each business id into a link to its Yelp page.
    recommendations['Restaurant'] = [
        f'<a href="https://www.yelp.com/biz/{x}" target="_blank">{x}</a>'
        for x in recommendations['Restaurant']
    ]

    # escape=False keeps the anchor tags as live HTML in the output.
    return recommendations.to_html(escape=False)
def predict_rating(
        driver,
        user_cat_ids,
        user_review_dist,
        business_review_dist,
        biz_category_lookup,
        user_category_lookup,
        biz_id):
    """Combine the business-side and user-side estimates into one rating.

    The two per-star distributions returned by ``biz_preference_demo`` and
    ``user_preference_demo`` are multiplied element-wise, renormalised so
    they sum to one, and reduced to a single number with
    ``expected_rating``.
    """
    from_business = biz_preference_demo(
        driver,
        user_cat_ids,
        business_review_dist,
        user_category_lookup,
        biz_id)
    from_user = user_preference_demo(
        driver,
        user_review_dist,
        biz_category_lookup,
        biz_id)

    # Element-wise product of the two distributions, then normalise.
    combined = from_business * from_user
    joint_prob = combined / sum(combined)
    return expected_rating(joint_prob)
def cypher(driver, query, results_columns):
    """Run a Cypher query and return its rows as a DataFrame.

    Args:
        driver: an object exposing ``session()`` that returns a context
            manager with a ``run(query)`` method (e.g. a neo4j driver).
        query: the Cypher query string to execute.
        results_columns: list of column names for the resulting DataFrame,
            one per value returned by the query.

    Returns:
        pandas.DataFrame: one row per record returned by the query.
    """
    with driver.session() as session:
        result = session.run(query)
        # Pull the records while the session is still open; the result
        # object may no longer be readable once the session has closed.
        rows = result.values()
    return pd.DataFrame(rows, columns=results_columns)
def expected_rating(rating_dist):
    """Return the expected star rating of a five-point distribution.

    Args:
        rating_dist: indexable sequence whose element ``k - 1`` is the
            probability of a ``k``-star rating, for ``k`` from 1 to 5.

    Returns:
        The probability-weighted sum of the star values 1 through 5.
    """
    return sum(stars * rating_dist[stars - 1] for stars in range(1, 6))
def biz_preference_demo(
        driver,
        user_cat_ids,
        all_business_review_dist,
        user_category_lookup,
        biz_id):
    """Estimate the target user's rating distribution from similar users.

    Uses the ratings that users in the target user's categories gave to the
    target business, combined naive-Bayes style with the business's overall
    rating distribution.

    Args:
        driver: unused here; kept so the signature matches the other
            prediction helpers.
        user_cat_ids: category ids describing the target user; compared
            against the ``rep.id`` column of ``user_category_lookup``.
        all_business_review_dist: DataFrame with columns ``b.id``, ``u.id``
            and ``r.stars``.
        user_category_lookup: DataFrame with columns ``u.id`` and
            ``rep.id``.
        biz_id: id of the business to score.

    Returns:
        numpy.ndarray: five probabilities (for 1 to 5 stars) summing to 1.
    """
    star_values = [1, 2, 3, 4, 5]

    # All distinct review rows for the target business, keyed by reviewer.
    business_review_dist = all_business_review_dist.loc[
        all_business_review_dist['b.id'] == biz_id].drop_duplicates()
    business_review_dist = business_review_dist.set_index('u.id')
    num_reviews = business_review_dist['r.stars'].shape[0]

    # Count of reviews per star value; star values never seen count as 0.
    numerator = (
        business_review_dist['r.stars']
        .value_counts()
        .reindex(star_values, fill_value=0)
        .to_numpy(dtype=float)
    )
    # Laplace-smoothed overall distribution of the business's ratings.
    PRu = (numerator + 1) / (num_reviews + 5)

    # One row of star counts per user category. Zero-initialised so that a
    # category with no matching reviewers contributes zero counts rather
    # than whatever happened to be in uninitialised memory.
    num_cat = len(user_cat_ids)
    cats_by_stars = np.zeros((num_cat, 5))
    for row, cat in enumerate(user_cat_ids):
        all_users = user_category_lookup.loc[
            user_category_lookup['rep.id'] == cat]
        # Reviews of this business written by users in this category.
        users = business_review_dist.merge(
            all_users, how='inner', right_on='u.id', left_index=True)
        cats_by_stars[row] = (
            users['r.stars']
            .value_counts()
            .reindex(star_values, fill_value=0)
            .to_numpy(dtype=float)
        )

    # Smoothed per-category terms, multiplied together across categories.
    PRaj = ((cats_by_stars + 1) / (numerator + num_cat)).prod(axis=0)

    # Take the product of the distributions and normalise it to sum to 1.
    biz_prefs_un_normalized = PRu * PRaj
    return biz_prefs_un_normalized / biz_prefs_un_normalized.sum()
def user_preference_demo(
        driver,
        user_review_dist,
        biz_category_lookup,
        biz_id):
    """Estimate the target user's rating distribution from their history.

    Uses the ratings the user gave to businesses sharing a category with
    the target business, combined naive-Bayes style with the user's overall
    rating distribution.

    Args:
        driver: unused here; kept so the signature matches the other
            prediction helpers.
        user_review_dist: DataFrame of the user's own ratings with columns
            ``b.id``, ``r.stars`` and ``cats``; each ``cats`` value must
            support ``in`` membership tests against a category id.
        biz_category_lookup: DataFrame with columns ``b.id`` and ``c.id``.
        biz_id: id of the business to score.

    Returns:
        numpy.ndarray: five probabilities (for 1 to 5 stars) summing to 1.
    """
    star_values = [1, 2, 3, 4, 5]

    # Categories of the target business, from the local lookup table.
    categories_df = biz_category_lookup.loc[
        biz_category_lookup['b.id'] == biz_id]
    cat_ids = set(categories_df['c.id'].values)

    # Laplace-smoothed distribution of all of the user's ratings:
    # prob(review from user = k).
    num_reviews = user_review_dist['r.stars'].shape[0]
    numerator = (
        user_review_dist['r.stars']
        .value_counts()
        .reindex(star_values, fill_value=0)
        .to_numpy(dtype=float)
    )
    PRu = (numerator + 1) / (num_reviews + 5)

    # For each target category, the stars the user gave to businesses in
    # that category. Every row of the history is scanned (not a fixed
    # count), and categories with no rated business are skipped.
    stars_by_cat = []
    for cat in cat_ids:
        stars = [
            int(row_stars)
            for row_cats, row_stars in zip(
                user_review_dist['cats'], user_review_dist['r.stars'])
            if cat in row_cats
        ]
        if stars:
            stars_by_cat.append(stars)

    # Star counts per matched category, then the smoothed estimate of
    # prob(review from user = k | business in category j), multiplied
    # together across categories.
    num_cat = len(stars_by_cat)
    cats_by_stars = np.zeros((num_cat, 5))
    for row, stars in enumerate(stars_by_cat):
        cats_by_stars[row] = (
            pd.Series(stars)
            .value_counts()
            .reindex(star_values, fill_value=0)
            .to_numpy(dtype=float)
        )
    PRaj = ((cats_by_stars + 1) / (numerator + num_cat)).prod(axis=0)

    # Take the product of the distributions and normalise it to sum to 1.
    user_prefs_un_normalized = PRu * PRaj
    return user_prefs_un_normalized / user_prefs_un_normalized.sum()