협업 필터링 파이썬 코드 공유 (영화 추천 ver)

본 포스팅에선 파이썬 협업 필터링 실습을 위한 파이썬 코드를 공유합니다.

import pandas as pd
import numpy as np

In [2]:

#데이터 불러옴
rating_df = pd.read_csv( '/Users/Desktop/경로명/파일명', delimiter = "\t", header = None )

In [3]:

rating_df.head( 10 )

Out[3]:

0123
01962423881250949
11863023891717742
2223771878887116
3244512880606923
41663461886397596
52984744884182806
61152652881171488
72534655891628467
83054513886324817
96863883603013

In [5]:

#열에 이름 붙이기 
rating_df.columns = ["userid", "movieid", "rating", "timestamp"]
rating_df.head( 10 )

Out[5]:

useridmovieidratingtimestamp
01962423881250949
11863023891717742
2223771878887116
3244512880606923
41663461886397596
52984744884182806
61152652881171488
72534655891628467
83054513886324817
96863883603013

In [6]:

#유저의 수
len( rating_df.userid.unique() )

Out[6]:

943

In [7]:

#영화의 수 
len( rating_df.movieid.unique() )

Out[7]:

1682

In [8]:

#timestamp열 필요없으니까 버리기
rating_df.drop( "timestamp", inplace = True, axis = 1 )
rating_df.head( 10 )

Out[8]:

useridmovieidrating
01962423
11863023
2223771
3244512
41663461
52984744
61152652
72534655
83054513
96863

In [11]:

#영화 데이터 가져오기 
movies_df = pd.read_csv( '/Users/lee/Desktop/마통자료/ml-100k/u.item', delimiter = '\|', header = None )
/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """Entry point for launching an IPython kernel.

In [13]:

movies_df = movies_df.iloc[:,:2]
movies_df.columns = ['movieid', 'title']
movies_df.head( 10 )

Out[13]:

movieidtitle
01Toy Story (1995)
12GoldenEye (1995)
23Four Rooms (1995)
34Get Shorty (1995)
45Copycat (1995)
56Shanghai Triad (Yao a yao yao dao waipo qiao) …
67Twelve Monkeys (1995)
78Babe (1995)
89Dead Man Walking (1995)
910Richard III (1995)

In [14]:

#유사한 유저 찾기
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [15]:

#피벗테이블만들기
user_movies_df = rating_df.pivot( index='userid', columns='movieid', values = "rating" ).reset_index(drop=True)

In [16]:

#평점 없는 데이터 0으로 채우기
user_movies_df.fillna( 0, inplace = True )

In [17]:

user_movies_df.shape

Out[17]:

(943, 1682)

In [18]:

#데이터 상태확인
user_movies_df.iloc[10:20, 20:30]

Out[18]:

movieid21222324252627282930
100.04.00.03.03.00.00.05.03.00.0
110.00.00.00.00.00.00.05.00.00.0
123.04.05.01.01.00.03.05.02.00.0
130.03.05.00.02.00.00.00.00.00.0
140.00.00.00.03.00.00.00.00.00.0
150.05.00.00.00.00.02.05.00.00.0
160.00.00.00.00.00.00.00.00.00.0
170.05.04.00.03.04.00.03.00.00.0
180.00.00.00.00.00.00.00.00.00.0
190.05.00.00.00.00.00.00.00.00.0

In [19]:

#부여한 평점을 바탕으로 거리계산 
#유클리드, 코사인, 피어슨 상관이 가장 널리 사용되는 유사도 계수
#여기서는 코사인 거리 사용해서 계산
user_sim = 1 - pairwise_distances( user_movies_df.as_matrix(), metric="cosine" )

In [20]:

user_sim_df = pd.DataFrame( user_sim )

In [21]:

#평점과 영화아이디에 따른 코사인 유사도 결과
#1에 가까울수록 유사함
user_sim_df[0:5]

Out[21]:

0123456789933934935936937938939940941942
01.0000000.1669310.0474600.0643580.3784750.4302390.4403670.3190720.0781380.3765440.3695270.1194820.2748760.1897050.1973260.1180950.3140720.1486170.1795080.398175
10.1669311.0000000.1105910.1781210.0729790.2458430.1073280.1033440.1610480.1598620.1569860.3079420.3587890.4240460.3198890.2285830.2267900.1614850.1722680.105798
20.0474600.1105911.0000000.3441510.0212450.0724150.0661370.0830600.0610400.0651510.0318750.0427530.1638290.0690380.1242450.0262710.1618900.1012430.1334160.026556
30.0643580.1781210.3441511.0000000.0318040.0680440.0912300.1880600.1012840.0608590.0521070.0367840.1331150.1934710.1460580.0301380.1968580.1520410.1700860.058752
40.3784750.0729790.0212450.0318041.0000000.2372860.3736000.2489300.0568470.2014270.3387940.0805800.0949240.0797790.1486070.0714590.2399550.1395950.1524970.313941

5 rows × 943 columns

In [22]:

#누구와 누구가 비슷할까?
#유저가 자기자신과 가장 유사한 것을 보여줌 
user_sim_df.idxmax(axis=1)[0:5]

Out[22]:

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [23]:

#따라서 자기와의 상관관계를 0으로 설정하는것이 필요
np.fill_diagonal( user_sim, 0 )

In [24]:

user_sim_df = pd.DataFrame( user_sim )

In [25]:

#0으로바꿔준 결과
user_sim_df[0:5]

Out[25]:

0123456789933934935936937938939940941942
00.0000000.1669310.0474600.0643580.3784750.4302390.4403670.3190720.0781380.3765440.3695270.1194820.2748760.1897050.1973260.1180950.3140720.1486170.1795080.398175
10.1669310.0000000.1105910.1781210.0729790.2458430.1073280.1033440.1610480.1598620.1569860.3079420.3587890.4240460.3198890.2285830.2267900.1614850.1722680.105798
20.0474600.1105910.0000000.3441510.0212450.0724150.0661370.0830600.0610400.0651510.0318750.0427530.1638290.0690380.1242450.0262710.1618900.1012430.1334160.026556
30.0643580.1781210.3441510.0000000.0318040.0680440.0912300.1880600.1012840.0608590.0521070.0367840.1331150.1934710.1460580.0301380.1968580.1520410.1700860.058752
40.3784750.0729790.0212450.0318040.0000000.2372860.3736000.2489300.0568470.2014270.3387940.0805800.0949240.0797790.1486070.0714590.2399550.1395950.1524970.313941

5 rows × 943 columns

In [26]:

#누가 누구랑 비슷할까?
#545유저와 757유저와 비슷함
#실제 유저 ID는 색인번호+1이됨
user_sim_df.idxmax(axis=1).sample( 10, random_state = 10 )

Out[26]:

544    756
309    246
448    893
628    537
284    413
572    693
225    866
567    311
75     176
726    496
dtype: int64

In [27]:

#유사한 유저가 실제로 비슷한 평점부여했는가 확인하기
def get_user_similar_movies( user1, user2 ):
  common_movies = rating_df[rating_df.userid == user1].merge(
      rating_df[rating_df.userid == user2],
      on = "movieid",
      how = "inner" )

  return common_movies.merge( movies_df, on = 'movieid' )

In [28]:

#310번유저와 247번유저 비교
get_user_similar_movies( 310, 247 )

Out[28]:

userid_xmovieidrating_xuserid_yrating_ytitle
031025832475Contact (1997)
131025752474Men in Black (1997)
231025152474Shall We Dance? (1996)
3310102252474Fast, Cheap & Out of Control (1997)
431022232473Star Trek: First Contact (1996)
531018142474Return of the Jedi (1983)
63105052475Star Wars (1977)

In [29]:

#아이템 유사성 찾기
#영화-사용자 피벗테이블 만들기
rating_mat = rating_df.pivot( index='movieid', columns='userid', values = "rating" ).reset_index(drop=True)

In [30]:

#평점없는 것들은 0으로 채우기
rating_mat.fillna( 0, inplace = True )

In [31]:

rating_mat.shape

Out[31]:

(1682, 943)

In [32]:

#행 : 영화
#열 : 사용자
#사용자가 영화 평점한 표
rating_mat.head( 10 )

Out[32]:

userid12345678910934935936937938939940941942943
05.04.00.00.04.04.00.00.00.04.02.03.04.00.04.00.00.05.00.00.0
13.00.00.00.03.00.00.00.00.00.04.00.00.00.00.00.00.00.00.05.0
24.00.00.00.00.00.00.00.00.00.00.00.04.00.00.00.00.00.00.00.0
33.00.00.00.00.00.05.00.00.04.05.00.00.00.00.00.02.00.00.00.0
43.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
55.00.00.00.00.00.00.00.05.00.00.00.05.00.00.00.00.00.00.00.0
64.00.00.00.00.02.05.03.04.04.00.00.04.00.04.00.04.04.00.00.0
71.00.00.00.00.04.05.00.00.00.00.00.00.00.00.00.05.00.00.00.0
85.00.00.00.00.04.05.00.00.04.00.01.04.05.03.05.03.00.00.03.0
93.02.00.00.00.00.04.00.00.00.00.00.00.00.00.00.00.00.00.00.0

10 rows × 943 columns

In [33]:

#유사성 계산
movie_sim = 1 - pairwise_distances( rating_mat.as_matrix(), metric="correlation" )

In [34]:

movie_sim.shape

Out[34]:

(1682, 1682)

In [35]:

movie_sim_df = pd.DataFrame( movie_sim )

In [36]:

movie_sim_df.head( 10 )

Out[36]:

01234567891672167316741675167616771678167916801681
01.0000000.2345950.1933620.2262130.1288400.0151130.3473540.2544900.2095020.1046550.018215-0.029676-0.029676-0.0296760.018215-0.029676-0.029676-0.0296760.0341790.034179
10.2345951.0000000.1906490.4090440.2407120.0300620.2200220.2060200.0778940.072906-0.012451-0.012451-0.012451-0.012451-0.012451-0.012451-0.012451-0.0124510.0714150.071415
20.1933620.1906491.0000000.2278490.1413680.0653470.2588550.0786360.1461810.079608-0.009764-0.009764-0.009764-0.0097640.023964-0.009764-0.009764-0.009764-0.0097640.091421
30.2262130.4090440.2278491.0000000.2372980.0218780.2954890.3528000.2299220.138220-0.016619-0.0166190.0889840.0889840.025622-0.016619-0.016619-0.0166190.0467430.067863
40.1288400.2407120.1413680.2372981.000000-0.0085940.2052890.1458660.142541-0.033746-0.009889-0.009889-0.009889-0.009889-0.009889-0.009889-0.009889-0.009889-0.0098890.088618
50.0151130.0300620.0653470.021878-0.0085941.0000000.0544150.0123300.0796190.166084-0.005159-0.005159-0.005159-0.005159-0.005159-0.005159-0.005159-0.005159-0.005159-0.005159
60.3473540.2200220.2588550.2954890.2052890.0544151.0000000.1906700.2865720.178505-0.0260360.039920-0.026036-0.0260360.039920-0.026036-0.026036-0.0260360.0399200.039920
70.2544900.2060200.0786360.3528000.1458660.0123300.1906701.0000000.2293310.152679-0.0172300.0756170.0570470.0570470.075617-0.017230-0.017230-0.0172300.075617-0.017230
80.2095020.0778940.1461810.2299220.1425410.0796190.2865720.2293311.0000000.158373-0.021125-0.0211250.0472730.0472730.064372-0.021125-0.021125-0.0211250.0472730.064372
90.1046550.0729060.0796080.138220-0.0337460.1660840.1785050.1526790.1583731.000000-0.010138-0.0101380.0739670.073967-0.010138-0.010138-0.010138-0.010138-0.010138-0.010138

10 rows × 1682 columns

In [37]:

#토이스토리를 기준으로 영화별 유사성 표시
#토이스토리와 얼마나 비슷한 영화인지 모든 영화에 대해 알려줌 
movies_df['similarity'] = movie_sim_df.iloc[0]
movies_df.columns = ['movieid', 'title', 'similarity']
movies_df.head( 10 )

Out[37]:

movieidtitlesimilarity
01Toy Story (1995)1.000000
12GoldenEye (1995)0.234595
23Four Rooms (1995)0.193362
34Get Shorty (1995)0.226213
45Copycat (1995)0.128840
56Shanghai Triad (Yao a yao yao dao waipo qiao) …0.015113
67Twelve Monkeys (1995)0.347354
78Babe (1995)0.254490
89Dead Man Walking (1995)0.209502
910Richard III (1995)0.104655

In [43]:

#Four Rooms를 기준으로 영화별 유사성 표시
movies_df['similarity'] = movie_sim_df.iloc[2]
movies_df.columns = ['movieid', 'title', 'similarity']
movies_df.head( 10 )

Out[43]:

movieidtitlesimilarity
01Toy Story (1995)0.193362
12GoldenEye (1995)0.190649
23Four Rooms (1995)1.000000
34Get Shorty (1995)0.227849
45Copycat (1995)0.141368
56Shanghai Triad (Yao a yao yao dao waipo qiao) …0.065347
67Twelve Monkeys (1995)0.258855
78Babe (1995)0.078636
89Dead Man Walking (1995)0.146181
910Richard III (1995)0.079608

In [38]:

#유사한 영화 찾아주는 기능 
#id number를 입력하면 유사한 순서대로 찾아줌 
def get_similar_movies( movieid, topN = 5 ):
  movies_df['similarity'] = movie_sim_df.iloc[movieid -1]
  top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
  print( "Similar Movies to: ", )
  return top_n

In [39]:

get_similar_movies( 228 )
Similar Movies to: 

Out[39]:

movieidtitlesimilarity
227228Star Trek: The Wrath of Khan (1982)1.000000
228229Star Trek III: The Search for Spock (1984)0.747498
229230Star Trek IV: The Voyage Home (1986)0.723112
226227Star Trek VI: The Undiscovered Country (1991)0.685605
175176Aliens (1986)0.590461
error: Content is protected !!
Scroll to Top