본 포스팅에선 파이썬 협업 필터링 실습을 위한 파이썬 코드를 공유합니다.
import pandas as pd import numpy as np
In [2]:
#데이터 불러옴 rating_df = pd.read_csv( '/Users/Desktop/경로명/파일명', delimiter = "\t", header = None )
In [3]:
rating_df.head( 10 )
Out[3]:
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 196 | 242 | 3 | 881250949 |
1 | 186 | 302 | 3 | 891717742 |
2 | 22 | 377 | 1 | 878887116 |
3 | 244 | 51 | 2 | 880606923 |
4 | 166 | 346 | 1 | 886397596 |
5 | 298 | 474 | 4 | 884182806 |
6 | 115 | 265 | 2 | 881171488 |
7 | 253 | 465 | 5 | 891628467 |
8 | 305 | 451 | 3 | 886324817 |
9 | 6 | 86 | 3 | 883603013 |
In [5]:
#열에 이름 붙이기 rating_df.columns = ["userid", "movieid", "rating", "timestamp"] rating_df.head( 10 )
Out[5]:
userid | movieid | rating | timestamp | |
---|---|---|---|---|
0 | 196 | 242 | 3 | 881250949 |
1 | 186 | 302 | 3 | 891717742 |
2 | 22 | 377 | 1 | 878887116 |
3 | 244 | 51 | 2 | 880606923 |
4 | 166 | 346 | 1 | 886397596 |
5 | 298 | 474 | 4 | 884182806 |
6 | 115 | 265 | 2 | 881171488 |
7 | 253 | 465 | 5 | 891628467 |
8 | 305 | 451 | 3 | 886324817 |
9 | 6 | 86 | 3 | 883603013 |
In [6]:
#유저의 수 len( rating_df.userid.unique() )
Out[6]:
943
In [7]:
#영화의 수 len( rating_df.movieid.unique() )
Out[7]:
1682
In [8]:
#timestamp열 필요없으니까 버리기 rating_df.drop( "timestamp", inplace = True, axis = 1 ) rating_df.head( 10 )
Out[8]:
userid | movieid | rating | |
---|---|---|---|
0 | 196 | 242 | 3 |
1 | 186 | 302 | 3 |
2 | 22 | 377 | 1 |
3 | 244 | 51 | 2 |
4 | 166 | 346 | 1 |
5 | 298 | 474 | 4 |
6 | 115 | 265 | 2 |
7 | 253 | 465 | 5 |
8 | 305 | 451 | 3 |
9 | 6 | 86 | 3 |
In [11]:
#영화 데이터 가져오기 movies_df = pd.read_csv( '/Users/lee/Desktop/마통자료/ml-100k/u.item', delimiter = '\|', header = None )
/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'. """Entry point for launching an IPython kernel.
In [13]:
movies_df = movies_df.iloc[:,:2] movies_df.columns = ['movieid', 'title'] movies_df.head( 10 )
Out[13]:
movieid | title | |
---|---|---|
0 | 1 | Toy Story (1995) |
1 | 2 | GoldenEye (1995) |
2 | 3 | Four Rooms (1995) |
3 | 4 | Get Shorty (1995) |
4 | 5 | Copycat (1995) |
5 | 6 | Shanghai Triad (Yao a yao yao dao waipo qiao) … |
6 | 7 | Twelve Monkeys (1995) |
7 | 8 | Babe (1995) |
8 | 9 | Dead Man Walking (1995) |
9 | 10 | Richard III (1995) |
In [14]:
#유사한 유저 찾기 from sklearn.metrics import pairwise_distances from scipy.spatial.distance import cosine, correlation
In [15]:
#피벗테이블만들기 user_movies_df = rating_df.pivot( index='userid', columns='movieid', values = "rating" ).reset_index(drop=True)
In [16]:
#평점 없는 데이터 0으로 채우기 user_movies_df.fillna( 0, inplace = True )
In [17]:
user_movies_df.shape
Out[17]:
(943, 1682)
In [18]:
#데이터 상태확인 user_movies_df.iloc[10:20, 20:30]
Out[18]:
movieid | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 |
---|---|---|---|---|---|---|---|---|---|---|
10 | 0.0 | 4.0 | 0.0 | 3.0 | 3.0 | 0.0 | 0.0 | 5.0 | 3.0 | 0.0 |
11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 |
12 | 3.0 | 4.0 | 5.0 | 1.0 | 1.0 | 0.0 | 3.0 | 5.0 | 2.0 | 0.0 |
13 | 0.0 | 3.0 | 5.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
14 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
15 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 0.0 | 0.0 |
16 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
17 | 0.0 | 5.0 | 4.0 | 0.0 | 3.0 | 4.0 | 0.0 | 3.0 | 0.0 | 0.0 |
18 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
19 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
In [19]:
#부여한 평점을 바탕으로 거리계산 #유클리드, 코사인, 피어슨 상관이 가장 널리 사용되는 유사도 계수 #여기서는 코사인 거리 사용해서 계산 user_sim = 1 - pairwise_distances( user_movies_df.as_matrix(), metric="cosine" )
In [20]:
user_sim_df = pd.DataFrame( user_sim )
In [21]:
#평점과 영화아이디에 따른 코사인 유사도 결과 #1에 가까울수록 유사함 user_sim_df[0:5]
Out[21]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | … | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 0.166931 | 0.047460 | 0.064358 | 0.378475 | 0.430239 | 0.440367 | 0.319072 | 0.078138 | 0.376544 | … | 0.369527 | 0.119482 | 0.274876 | 0.189705 | 0.197326 | 0.118095 | 0.314072 | 0.148617 | 0.179508 | 0.398175 |
1 | 0.166931 | 1.000000 | 0.110591 | 0.178121 | 0.072979 | 0.245843 | 0.107328 | 0.103344 | 0.161048 | 0.159862 | … | 0.156986 | 0.307942 | 0.358789 | 0.424046 | 0.319889 | 0.228583 | 0.226790 | 0.161485 | 0.172268 | 0.105798 |
2 | 0.047460 | 0.110591 | 1.000000 | 0.344151 | 0.021245 | 0.072415 | 0.066137 | 0.083060 | 0.061040 | 0.065151 | … | 0.031875 | 0.042753 | 0.163829 | 0.069038 | 0.124245 | 0.026271 | 0.161890 | 0.101243 | 0.133416 | 0.026556 |
3 | 0.064358 | 0.178121 | 0.344151 | 1.000000 | 0.031804 | 0.068044 | 0.091230 | 0.188060 | 0.101284 | 0.060859 | … | 0.052107 | 0.036784 | 0.133115 | 0.193471 | 0.146058 | 0.030138 | 0.196858 | 0.152041 | 0.170086 | 0.058752 |
4 | 0.378475 | 0.072979 | 0.021245 | 0.031804 | 1.000000 | 0.237286 | 0.373600 | 0.248930 | 0.056847 | 0.201427 | … | 0.338794 | 0.080580 | 0.094924 | 0.079779 | 0.148607 | 0.071459 | 0.239955 | 0.139595 | 0.152497 | 0.313941 |
5 rows × 943 columns
In [22]:
#누구와 누구가 비슷할까? #유저가 자기자신과 가장 유사한 것을 보여줌 user_sim_df.idxmax(axis=1)[0:5]
Out[22]:
0 0 1 1 2 2 3 3 4 4 dtype: int64
In [23]:
#따라서 자기와의 상관관계를 0으로 설정하는것이 필요 np.fill_diagonal( user_sim, 0 )
In [24]:
user_sim_df = pd.DataFrame( user_sim )
In [25]:
#0으로바꿔준 결과 user_sim_df[0:5]
Out[25]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | … | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.166931 | 0.047460 | 0.064358 | 0.378475 | 0.430239 | 0.440367 | 0.319072 | 0.078138 | 0.376544 | … | 0.369527 | 0.119482 | 0.274876 | 0.189705 | 0.197326 | 0.118095 | 0.314072 | 0.148617 | 0.179508 | 0.398175 |
1 | 0.166931 | 0.000000 | 0.110591 | 0.178121 | 0.072979 | 0.245843 | 0.107328 | 0.103344 | 0.161048 | 0.159862 | … | 0.156986 | 0.307942 | 0.358789 | 0.424046 | 0.319889 | 0.228583 | 0.226790 | 0.161485 | 0.172268 | 0.105798 |
2 | 0.047460 | 0.110591 | 0.000000 | 0.344151 | 0.021245 | 0.072415 | 0.066137 | 0.083060 | 0.061040 | 0.065151 | … | 0.031875 | 0.042753 | 0.163829 | 0.069038 | 0.124245 | 0.026271 | 0.161890 | 0.101243 | 0.133416 | 0.026556 |
3 | 0.064358 | 0.178121 | 0.344151 | 0.000000 | 0.031804 | 0.068044 | 0.091230 | 0.188060 | 0.101284 | 0.060859 | … | 0.052107 | 0.036784 | 0.133115 | 0.193471 | 0.146058 | 0.030138 | 0.196858 | 0.152041 | 0.170086 | 0.058752 |
4 | 0.378475 | 0.072979 | 0.021245 | 0.031804 | 0.000000 | 0.237286 | 0.373600 | 0.248930 | 0.056847 | 0.201427 | … | 0.338794 | 0.080580 | 0.094924 | 0.079779 | 0.148607 | 0.071459 | 0.239955 | 0.139595 | 0.152497 | 0.313941 |
5 rows × 943 columns
In [26]:
#누가 누구랑 비슷할까? #545유저와 757유저와 비슷함 #실제 유저 ID는 색인번호+1이됨 user_sim_df.idxmax(axis=1).sample( 10, random_state = 10 )
Out[26]:
544 756 309 246 448 893 628 537 284 413 572 693 225 866 567 311 75 176 726 496 dtype: int64
In [27]:
#유사한 유저가 실제로 비슷한 평점부여했는가 확인하기 def get_user_similar_movies( user1, user2 ): common_movies = rating_df[rating_df.userid == user1].merge( rating_df[rating_df.userid == user2], on = "movieid", how = "inner" ) return common_movies.merge( movies_df, on = 'movieid' )
In [28]:
#310번유저와 247번유저 비교 get_user_similar_movies( 310, 247 )
Out[28]:
userid_x | movieid | rating_x | userid_y | rating_y | title | |
---|---|---|---|---|---|---|
0 | 310 | 258 | 3 | 247 | 5 | Contact (1997) |
1 | 310 | 257 | 5 | 247 | 4 | Men in Black (1997) |
2 | 310 | 251 | 5 | 247 | 4 | Shall We Dance? (1996) |
3 | 310 | 1022 | 5 | 247 | 4 | Fast, Cheap & Out of Control (1997) |
4 | 310 | 222 | 3 | 247 | 3 | Star Trek: First Contact (1996) |
5 | 310 | 181 | 4 | 247 | 4 | Return of the Jedi (1983) |
6 | 310 | 50 | 5 | 247 | 5 | Star Wars (1977) |
In [29]:
#아이템 유사성 찾기 #영화-사용자 피벗테이블 만들기 rating_mat = rating_df.pivot( index='movieid', columns='userid', values = "rating" ).reset_index(drop=True)
In [30]:
#평점없는 것들은 0으로 채우기 rating_mat.fillna( 0, inplace = True )
In [31]:
rating_mat.shape
Out[31]:
(1682, 943)
In [32]:
#행 : 영화 #열 : 사용자 #사용자가 영화 평점한 표 rating_mat.head( 10 )
Out[32]:
userid | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | … | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5.0 | 4.0 | 0.0 | 0.0 | 4.0 | 4.0 | 0.0 | 0.0 | 0.0 | 4.0 | … | 2.0 | 3.0 | 4.0 | 0.0 | 4.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 |
1 | 3.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 |
2 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 4.0 | … | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 |
4 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | … | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
6 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 3.0 | 4.0 | 4.0 | … | 0.0 | 0.0 | 4.0 | 0.0 | 4.0 | 0.0 | 4.0 | 4.0 | 0.0 | 0.0 |
7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 5.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 |
8 | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 5.0 | 0.0 | 0.0 | 4.0 | … | 0.0 | 1.0 | 4.0 | 5.0 | 3.0 | 5.0 | 3.0 | 0.0 | 0.0 | 3.0 |
9 | 3.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | … | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
10 rows × 943 columns
In [33]:
#유사성 계산 movie_sim = 1 - pairwise_distances( rating_mat.as_matrix(), metric="correlation" )
In [34]:
movie_sim.shape
Out[34]:
(1682, 1682)
In [35]:
movie_sim_df = pd.DataFrame( movie_sim )
In [36]:
movie_sim_df.head( 10 )
Out[36]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | … | 1672 | 1673 | 1674 | 1675 | 1676 | 1677 | 1678 | 1679 | 1680 | 1681 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 0.234595 | 0.193362 | 0.226213 | 0.128840 | 0.015113 | 0.347354 | 0.254490 | 0.209502 | 0.104655 | … | 0.018215 | -0.029676 | -0.029676 | -0.029676 | 0.018215 | -0.029676 | -0.029676 | -0.029676 | 0.034179 | 0.034179 |
1 | 0.234595 | 1.000000 | 0.190649 | 0.409044 | 0.240712 | 0.030062 | 0.220022 | 0.206020 | 0.077894 | 0.072906 | … | -0.012451 | -0.012451 | -0.012451 | -0.012451 | -0.012451 | -0.012451 | -0.012451 | -0.012451 | 0.071415 | 0.071415 |
2 | 0.193362 | 0.190649 | 1.000000 | 0.227849 | 0.141368 | 0.065347 | 0.258855 | 0.078636 | 0.146181 | 0.079608 | … | -0.009764 | -0.009764 | -0.009764 | -0.009764 | 0.023964 | -0.009764 | -0.009764 | -0.009764 | -0.009764 | 0.091421 |
3 | 0.226213 | 0.409044 | 0.227849 | 1.000000 | 0.237298 | 0.021878 | 0.295489 | 0.352800 | 0.229922 | 0.138220 | … | -0.016619 | -0.016619 | 0.088984 | 0.088984 | 0.025622 | -0.016619 | -0.016619 | -0.016619 | 0.046743 | 0.067863 |
4 | 0.128840 | 0.240712 | 0.141368 | 0.237298 | 1.000000 | -0.008594 | 0.205289 | 0.145866 | 0.142541 | -0.033746 | … | -0.009889 | -0.009889 | -0.009889 | -0.009889 | -0.009889 | -0.009889 | -0.009889 | -0.009889 | -0.009889 | 0.088618 |
5 | 0.015113 | 0.030062 | 0.065347 | 0.021878 | -0.008594 | 1.000000 | 0.054415 | 0.012330 | 0.079619 | 0.166084 | … | -0.005159 | -0.005159 | -0.005159 | -0.005159 | -0.005159 | -0.005159 | -0.005159 | -0.005159 | -0.005159 | -0.005159 |
6 | 0.347354 | 0.220022 | 0.258855 | 0.295489 | 0.205289 | 0.054415 | 1.000000 | 0.190670 | 0.286572 | 0.178505 | … | -0.026036 | 0.039920 | -0.026036 | -0.026036 | 0.039920 | -0.026036 | -0.026036 | -0.026036 | 0.039920 | 0.039920 |
7 | 0.254490 | 0.206020 | 0.078636 | 0.352800 | 0.145866 | 0.012330 | 0.190670 | 1.000000 | 0.229331 | 0.152679 | … | -0.017230 | 0.075617 | 0.057047 | 0.057047 | 0.075617 | -0.017230 | -0.017230 | -0.017230 | 0.075617 | -0.017230 |
8 | 0.209502 | 0.077894 | 0.146181 | 0.229922 | 0.142541 | 0.079619 | 0.286572 | 0.229331 | 1.000000 | 0.158373 | … | -0.021125 | -0.021125 | 0.047273 | 0.047273 | 0.064372 | -0.021125 | -0.021125 | -0.021125 | 0.047273 | 0.064372 |
9 | 0.104655 | 0.072906 | 0.079608 | 0.138220 | -0.033746 | 0.166084 | 0.178505 | 0.152679 | 0.158373 | 1.000000 | … | -0.010138 | -0.010138 | 0.073967 | 0.073967 | -0.010138 | -0.010138 | -0.010138 | -0.010138 | -0.010138 | -0.010138 |
10 rows × 1682 columns
In [37]:
#토이스토리를 기준으로 영화별 유사성 표시 #토이스토리와 얼마나 비슷한 영화인지 모든 영화에 대해 알려줌 movies_df['similarity'] = movie_sim_df.iloc[0] movies_df.columns = ['movieid', 'title', 'similarity'] movies_df.head( 10 )
Out[37]:
movieid | title | similarity | |
---|---|---|---|
0 | 1 | Toy Story (1995) | 1.000000 |
1 | 2 | GoldenEye (1995) | 0.234595 |
2 | 3 | Four Rooms (1995) | 0.193362 |
3 | 4 | Get Shorty (1995) | 0.226213 |
4 | 5 | Copycat (1995) | 0.128840 |
5 | 6 | Shanghai Triad (Yao a yao yao dao waipo qiao) … | 0.015113 |
6 | 7 | Twelve Monkeys (1995) | 0.347354 |
7 | 8 | Babe (1995) | 0.254490 |
8 | 9 | Dead Man Walking (1995) | 0.209502 |
9 | 10 | Richard III (1995) | 0.104655 |
In [43]:
#Four Rooms를 기준으로 영화별 유사성 표시 movies_df['similarity'] = movie_sim_df.iloc[2] movies_df.columns = ['movieid', 'title', 'similarity'] movies_df.head( 10 )
Out[43]:
movieid | title | similarity | |
---|---|---|---|
0 | 1 | Toy Story (1995) | 0.193362 |
1 | 2 | GoldenEye (1995) | 0.190649 |
2 | 3 | Four Rooms (1995) | 1.000000 |
3 | 4 | Get Shorty (1995) | 0.227849 |
4 | 5 | Copycat (1995) | 0.141368 |
5 | 6 | Shanghai Triad (Yao a yao yao dao waipo qiao) … | 0.065347 |
6 | 7 | Twelve Monkeys (1995) | 0.258855 |
7 | 8 | Babe (1995) | 0.078636 |
8 | 9 | Dead Man Walking (1995) | 0.146181 |
9 | 10 | Richard III (1995) | 0.079608 |
In [38]:
#유사한 영화 찾아주는 기능 #id number를 입력하면 유사한 순서대로 찾아줌 def get_similar_movies( movieid, topN = 5 ): movies_df['similarity'] = movie_sim_df.iloc[movieid -1] top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN] print( "Similar Movies to: ", ) return top_n
In [39]:
get_similar_movies( 228 )
Similar Movies to:
Out[39]:
movieid | title | similarity | |
---|---|---|---|
227 | 228 | Star Trek: The Wrath of Khan (1982) | 1.000000 |
228 | 229 | Star Trek III: The Search for Spock (1984) | 0.747498 |
229 | 230 | Star Trek IV: The Voyage Home (1986) | 0.723112 |
226 | 227 | Star Trek VI: The Undiscovered Country (1991) | 0.685605 |
175 | 176 | Aliens (1986) | 0.590461 |