# user_median.py
import numpy as np
import csv
# Predict via the user-specific median.
# If the user has no data, use the global median.
train_file = 'train.csv'
test_file = 'test.csv'
soln_file = 'user_median.csv'


def _load_train_data(path):
    """Read the training CSV into a ``{user: {artist: plays}}`` mapping.

    The first row is treated as a header and skipped.  Every other row is
    read as ``user, artist, plays`` (columns 0-2) and ``plays`` is converted
    with ``int``.  If a (user, artist) pair occurs more than once, the last
    occurrence wins.
    """
    train_data = {}
    with open(path, 'r') as train_fh:
        train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
        next(train_csv, None)
        for row in train_csv:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); indexing such a row would raise IndexError.
                continue
            user, artist, plays = row[0], row[1], row[2]
            train_data.setdefault(user, {})[artist] = int(plays)
    return train_data


def _compute_medians(train_data):
    """Return ``(user_medians, global_median)`` for the loaded play counts.

    ``user_medians`` maps each user to the median of that user's play
    counts; ``global_median`` is the median over every play count.
    """
    plays_array = []
    user_medians = {}
    # .items()/.values() instead of the Python 2-only .iteritems().
    for user, user_data in train_data.items():
        user_plays = list(user_data.values())
        plays_array.extend(user_plays)
        user_medians[user] = np.median(np.array(user_plays))
    global_median = np.median(np.array(plays_array))
    return user_medians, global_median


def _write_solutions(test_path, soln_path, user_medians, global_median):
    """Write one ``Id, plays`` prediction row for each row of the test CSV.

    Test rows are read as ``Id, user, ...`` after skipping the header row.
    Users present in ``user_medians`` get their own median; all other users
    get ``global_median`` and are reported on stdout.
    """
    with open(test_path, 'r') as test_fh, open(soln_path, 'w') as soln_fh:
        test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
        next(test_csv, None)
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])
        for row in test_csv:
            if not row:
                continue
            # "row_id" rather than "id" so the builtin is not shadowed.
            row_id, user = row[0], row[1]
            if user in user_medians:
                soln_csv.writerow([row_id, user_medians[user]])
            else:
                # Report the user (the message previously printed the row
                # Id under the label "User").  A single pre-formatted string
                # keeps the call valid as both statement and function.
                print("User %s not in training data." % user)
                soln_csv.writerow([row_id, global_median])


def main(train_path=train_file, test_path=test_file, soln_path=soln_file):
    """Predict plays via the user-specific median and write the solution CSV.

    Falls back to the global median for users with no training data.  The
    path arguments default to the module-level file names.
    """
    train_data = _load_train_data(train_path)
    user_medians, global_median = _compute_medians(train_data)
    _write_solutions(test_path, soln_path, user_medians, global_median)


if __name__ == '__main__':
    main()