협업 필터링 파이썬 코드 공유 (영화 추천 ver)

본 포스팅에선 파이썬 협업 필터링 실습을 위한 파이썬 코드를 공유합니다.

import pandas as pd
import numpy as np

In [2]:

#데이터 불러옴
rating_df = pd.read_csv( '/Users/Desktop/경로명/파일명', delimiter = "\t", header = None )

In [3]:

rating_df.head( 10 )

Out[3]:

	0	1	2	3
0	196	242	3	881250949
1	186	302	3	891717742
2	22	377	1	878887116
3	244	51	2	880606923
4	166	346	1	886397596
5	298	474	4	884182806
6	115	265	2	881171488
7	253	465	5	891628467
8	305	451	3	886324817
9	6	86	3	883603013

In [5]:

#열에 이름 붙이기 
rating_df.columns = ["userid", "movieid", "rating", "timestamp"]
rating_df.head( 10 )

Out[5]:

	userid	movieid	rating	timestamp
0	196	242	3	881250949
1	186	302	3	891717742
2	22	377	1	878887116
3	244	51	2	880606923
4	166	346	1	886397596
5	298	474	4	884182806
6	115	265	2	881171488
7	253	465	5	891628467
8	305	451	3	886324817
9	6	86	3	883603013

In [6]:

#유저의 수
len( rating_df.userid.unique() )

Out[6]:

In [7]:

#영화의 수 
len( rating_df.movieid.unique() )

Out[7]:

In [8]:

#timestamp열 필요없으니까 버리기
rating_df.drop( "timestamp", inplace = True, axis = 1 )
rating_df.head( 10 )

Out[8]:

	userid	movieid	rating
0	196	242	3
1	186	302	3
2	22	377	1
3	244	51	2
4	166	346	1
5	298	474	4
6	115	265	2
7	253	465	5
8	305	451	3
9	6	86	3

In [11]:

#영화 데이터 가져오기 
movies_df = pd.read_csv( '/Users/lee/Desktop/마통자료/ml-100k/u.item', delimiter = '\|', header = None )

/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """Entry point for launching an IPython kernel.

In [13]:

movies_df = movies_df.iloc[:,:2]
movies_df.columns = ['movieid', 'title']
movies_df.head( 10 )

Out[13]:

	movieid	title
0	1	Toy Story (1995)
1	2	GoldenEye (1995)
2	3	Four Rooms (1995)
3	4	Get Shorty (1995)
4	5	Copycat (1995)
5	6	Shanghai Triad (Yao a yao yao dao waipo qiao) …
6	7	Twelve Monkeys (1995)
7	8	Babe (1995)
8	9	Dead Man Walking (1995)
9	10	Richard III (1995)

In [14]:

#유사한 유저 찾기
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [15]:

#피벗테이블만들기
user_movies_df = rating_df.pivot( index='userid', columns='movieid', values = "rating" ).reset_index(drop=True)

In [16]:

#평점 없는 데이터 0으로 채우기
user_movies_df.fillna( 0, inplace = True )

In [17]:

user_movies_df.shape

Out[17]:

(943, 1682)

In [18]:

#데이터 상태확인
user_movies_df.iloc[10:20, 20:30]

Out[18]:

movieid	21	22	23	24	25	26	27	28	29
10	0.0	4.0	0.0	3.0	3.0	0.0	0.0	5.0	3.0
11	0.0	0.0	0.0	0.0	0.0	0.0	0.0	5.0	0.0
12	3.0	4.0	5.0	1.0	1.0	0.0	3.0	5.0	2.0
13	0.0	3.0	5.0	0.0	2.0	0.0	0.0	0.0	0.0
14	0.0	0.0	0.0	0.0	3.0	0.0	0.0	0.0	0.0
15	0.0	5.0	0.0	0.0	0.0	0.0	2.0	5.0	0.0
16	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
17	0.0	5.0	4.0	0.0	3.0	4.0	0.0	3.0	0.0
18	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
19	0.0	5.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

In [19]:

#부여한 평점을 바탕으로 거리계산 
#유클리드, 코사인, 피어슨 상관이 가장 널리 사용되는 유사도 계수
#여기서는 코사인 거리 사용해서 계산
user_sim = 1 - pairwise_distances( user_movies_df.as_matrix(), metric="cosine" )

In [20]:

user_sim_df = pd.DataFrame( user_sim )

In [21]:

#평점과 영화아이디에 따른 코사인 유사도 결과
#1에 가까울수록 유사함
user_sim_df[0:5]

Out[21]:

	0	1	2	3	4	5	6	7	8	9	…	933	934	935	936	937	938	939	940	941	942
0	1.000000	0.166931	0.047460	0.064358	0.378475	0.430239	0.440367	0.319072	0.078138	0.376544	…	0.369527	0.119482	0.274876	0.189705	0.197326	0.118095	0.314072	0.148617	0.179508	0.398175
1	0.166931	1.000000	0.110591	0.178121	0.072979	0.245843	0.107328	0.103344	0.161048	0.159862	…	0.156986	0.307942	0.358789	0.424046	0.319889	0.228583	0.226790	0.161485	0.172268	0.105798
2	0.047460	0.110591	1.000000	0.344151	0.021245	0.072415	0.066137	0.083060	0.061040	0.065151	…	0.031875	0.042753	0.163829	0.069038	0.124245	0.026271	0.161890	0.101243	0.133416	0.026556
3	0.064358	0.178121	0.344151	1.000000	0.031804	0.068044	0.091230	0.188060	0.101284	0.060859	…	0.052107	0.036784	0.133115	0.193471	0.146058	0.030138	0.196858	0.152041	0.170086	0.058752
4	0.378475	0.072979	0.021245	0.031804	1.000000	0.237286	0.373600	0.248930	0.056847	0.201427	…	0.338794	0.080580	0.094924	0.079779	0.148607	0.071459	0.239955	0.139595	0.152497	0.313941

5 rows × 943 columns

In [22]:

#누구와 누구가 비슷할까?
#유저가 자기자신과 가장 유사한 것을 보여줌 
user_sim_df.idxmax(axis=1)[0:5]

Out[22]:

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [23]:

#따라서 자기와의 상관관계를 0으로 설정하는것이 필요
np.fill_diagonal( user_sim, 0 )

In [24]:

user_sim_df = pd.DataFrame( user_sim )

In [25]:

#0으로바꿔준 결과
user_sim_df[0:5]

Out[25]:

	0	1	2	3	4	5	6	7	8	9	…	933	934	935	936	937	938	939	940	941	942
0	0.000000	0.166931	0.047460	0.064358	0.378475	0.430239	0.440367	0.319072	0.078138	0.376544	…	0.369527	0.119482	0.274876	0.189705	0.197326	0.118095	0.314072	0.148617	0.179508	0.398175
1	0.166931	0.000000	0.110591	0.178121	0.072979	0.245843	0.107328	0.103344	0.161048	0.159862	…	0.156986	0.307942	0.358789	0.424046	0.319889	0.228583	0.226790	0.161485	0.172268	0.105798
2	0.047460	0.110591	0.000000	0.344151	0.021245	0.072415	0.066137	0.083060	0.061040	0.065151	…	0.031875	0.042753	0.163829	0.069038	0.124245	0.026271	0.161890	0.101243	0.133416	0.026556
3	0.064358	0.178121	0.344151	0.000000	0.031804	0.068044	0.091230	0.188060	0.101284	0.060859	…	0.052107	0.036784	0.133115	0.193471	0.146058	0.030138	0.196858	0.152041	0.170086	0.058752
4	0.378475	0.072979	0.021245	0.031804	0.000000	0.237286	0.373600	0.248930	0.056847	0.201427	…	0.338794	0.080580	0.094924	0.079779	0.148607	0.071459	0.239955	0.139595	0.152497	0.313941

5 rows × 943 columns

In [26]:

#누가 누구랑 비슷할까?
#545유저와 757유저와 비슷함
#실제 유저 ID는 색인번호+1이됨
user_sim_df.idxmax(axis=1).sample( 10, random_state = 10 )

Out[26]:

544    756
309    246
448    893
628    537
284    413
572    693
225    866
567    311
75     176
726    496
dtype: int64

In [27]:

#유사한 유저가 실제로 비슷한 평점부여했는가 확인하기
def get_user_similar_movies( user1, user2 ):
  common_movies = rating_df[rating_df.userid == user1].merge(
      rating_df[rating_df.userid == user2],
      on = "movieid",
      how = "inner" )

  return common_movies.merge( movies_df, on = 'movieid' )

In [28]:

#310번유저와 247번유저 비교
get_user_similar_movies( 310, 247 )

Out[28]:

	userid_x	movieid	rating_x	userid_y	rating_y	title
0	310	258	3	247	5	Contact (1997)
1	310	257	5	247	4	Men in Black (1997)
2	310	251	5	247	4	Shall We Dance? (1996)
3	310	1022	5	247	4	Fast, Cheap & Out of Control (1997)
4	310	222	3	247	3	Star Trek: First Contact (1996)
5	310	181	4	247	4	Return of the Jedi (1983)
6	310	50	5	247	5	Star Wars (1977)

In [29]:

#아이템 유사성 찾기
#영화-사용자 피벗테이블 만들기
rating_mat = rating_df.pivot( index='movieid', columns='userid', values = "rating" ).reset_index(drop=True)

In [30]:

#평점없는 것들은 0으로 채우기
rating_mat.fillna( 0, inplace = True )

In [31]:

rating_mat.shape

Out[31]:

(1682, 943)

In [32]:

#행 : 영화
#열 : 사용자
#사용자가 영화 평점한 표
rating_mat.head( 10 )

Out[32]:

userid	1	2	5	6	7	8	9	10	…	934	935	936	937	938	939	940	941	943
0	5.0	4.0	4.0	4.0	0.0	0.0	0.0	4.0	…	2.0	3.0	4.0	0.0	4.0	0.0	0.0	5.0	0.0
1	3.0	0.0	3.0	0.0	0.0	0.0	0.0	0.0	…	4.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	5.0
2	4.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	…	0.0	0.0	4.0	0.0	0.0	0.0	0.0	0.0	0.0
3	3.0	0.0	0.0	0.0	5.0	0.0	0.0	4.0	…	5.0	0.0	0.0	0.0	0.0	0.0	2.0	0.0	0.0
4	3.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	…	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5	5.0	0.0	0.0	0.0	0.0	0.0	5.0	0.0	…	0.0	0.0	5.0	0.0	0.0	0.0	0.0	0.0	0.0
6	4.0	0.0	0.0	2.0	5.0	3.0	4.0	4.0	…	0.0	0.0	4.0	0.0	4.0	0.0	4.0	4.0	0.0
7	1.0	0.0	0.0	4.0	5.0	0.0	0.0	0.0	…	0.0	0.0	0.0	0.0	0.0	0.0	5.0	0.0	0.0
8	5.0	0.0	0.0	4.0	5.0	0.0	0.0	4.0	…	0.0	1.0	4.0	5.0	3.0	5.0	3.0	0.0	3.0
9	3.0	2.0	0.0	0.0	4.0	0.0	0.0	0.0	…	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

10 rows × 943 columns

In [33]:

#유사성 계산
movie_sim = 1 - pairwise_distances( rating_mat.as_matrix(), metric="correlation" )

In [34]:

movie_sim.shape

Out[34]:

(1682, 1682)

In [35]:

movie_sim_df = pd.DataFrame( movie_sim )

In [36]:

movie_sim_df.head( 10 )

Out[36]:

	0	1	2	3	4	5	6	7	8	9	…	1672	1673	1674	1675	1676	1677	1678	1679	1680	1681
0	1.000000	0.234595	0.193362	0.226213	0.128840	0.015113	0.347354	0.254490	0.209502	0.104655	…	0.018215	-0.029676	-0.029676	-0.029676	0.018215	-0.029676	-0.029676	-0.029676	0.034179	0.034179
1	0.234595	1.000000	0.190649	0.409044	0.240712	0.030062	0.220022	0.206020	0.077894	0.072906	…	-0.012451	-0.012451	-0.012451	-0.012451	-0.012451	-0.012451	-0.012451	-0.012451	0.071415	0.071415
2	0.193362	0.190649	1.000000	0.227849	0.141368	0.065347	0.258855	0.078636	0.146181	0.079608	…	-0.009764	-0.009764	-0.009764	-0.009764	0.023964	-0.009764	-0.009764	-0.009764	-0.009764	0.091421
3	0.226213	0.409044	0.227849	1.000000	0.237298	0.021878	0.295489	0.352800	0.229922	0.138220	…	-0.016619	-0.016619	0.088984	0.088984	0.025622	-0.016619	-0.016619	-0.016619	0.046743	0.067863
4	0.128840	0.240712	0.141368	0.237298	1.000000	-0.008594	0.205289	0.145866	0.142541	-0.033746	…	-0.009889	-0.009889	-0.009889	-0.009889	-0.009889	-0.009889	-0.009889	-0.009889	-0.009889	0.088618
5	0.015113	0.030062	0.065347	0.021878	-0.008594	1.000000	0.054415	0.012330	0.079619	0.166084	…	-0.005159	-0.005159	-0.005159	-0.005159	-0.005159	-0.005159	-0.005159	-0.005159	-0.005159	-0.005159
6	0.347354	0.220022	0.258855	0.295489	0.205289	0.054415	1.000000	0.190670	0.286572	0.178505	…	-0.026036	0.039920	-0.026036	-0.026036	0.039920	-0.026036	-0.026036	-0.026036	0.039920	0.039920
7	0.254490	0.206020	0.078636	0.352800	0.145866	0.012330	0.190670	1.000000	0.229331	0.152679	…	-0.017230	0.075617	0.057047	0.057047	0.075617	-0.017230	-0.017230	-0.017230	0.075617	-0.017230
8	0.209502	0.077894	0.146181	0.229922	0.142541	0.079619	0.286572	0.229331	1.000000	0.158373	…	-0.021125	-0.021125	0.047273	0.047273	0.064372	-0.021125	-0.021125	-0.021125	0.047273	0.064372
9	0.104655	0.072906	0.079608	0.138220	-0.033746	0.166084	0.178505	0.152679	0.158373	1.000000	…	-0.010138	-0.010138	0.073967	0.073967	-0.010138	-0.010138	-0.010138	-0.010138	-0.010138	-0.010138

10 rows × 1682 columns

In [37]:

#토이스토리를 기준으로 영화별 유사성 표시
#토이스토리와 얼마나 비슷한 영화인지 모든 영화에 대해 알려줌 
movies_df['similarity'] = movie_sim_df.iloc[0]
movies_df.columns = ['movieid', 'title', 'similarity']
movies_df.head( 10 )

Out[37]:

	movieid	title	similarity
0	1	Toy Story (1995)	1.000000
1	2	GoldenEye (1995)	0.234595
2	3	Four Rooms (1995)	0.193362
3	4	Get Shorty (1995)	0.226213
4	5	Copycat (1995)	0.128840
5	6	Shanghai Triad (Yao a yao yao dao waipo qiao) …	0.015113
6	7	Twelve Monkeys (1995)	0.347354
7	8	Babe (1995)	0.254490
8	9	Dead Man Walking (1995)	0.209502
9	10	Richard III (1995)	0.104655

In [43]:

#Four Rooms를 기준으로 영화별 유사성 표시
movies_df['similarity'] = movie_sim_df.iloc[2]
movies_df.columns = ['movieid', 'title', 'similarity']
movies_df.head( 10 )

Out[43]:

	movieid	title	similarity
0	1	Toy Story (1995)	0.193362
1	2	GoldenEye (1995)	0.190649
2	3	Four Rooms (1995)	1.000000
3	4	Get Shorty (1995)	0.227849
4	5	Copycat (1995)	0.141368
5	6	Shanghai Triad (Yao a yao yao dao waipo qiao) …	0.065347
6	7	Twelve Monkeys (1995)	0.258855
7	8	Babe (1995)	0.078636
8	9	Dead Man Walking (1995)	0.146181
9	10	Richard III (1995)	0.079608

In [38]:

#유사한 영화 찾아주는 기능 
#id number를 입력하면 유사한 순서대로 찾아줌 
def get_similar_movies( movieid, topN = 5 ):
  movies_df['similarity'] = movie_sim_df.iloc[movieid -1]
  top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
  print( "Similar Movies to: ", )
  return top_n

In [39]:

get_similar_movies( 228 )

Similar Movies to:

Out[39]:

	movieid	title	similarity
227	228	Star Trek: The Wrath of Khan (1982)	1.000000
228	229	Star Trek III: The Search for Spock (1984)	0.747498
229	230	Star Trek IV: The Voyage Home (1986)	0.723112
226	227	Star Trek VI: The Undiscovered Country (1991)	0.685605
175	176	Aliens (1986)	0.590461