250x250
Notice
Recent Posts
Recent Comments
Link
코딩걸음마
[추천 시스템(RS)] DeepFM Frame 본문
728x90
CTR(Click-Through Rate) 예측 : user가 추천된 항목을 click할 확률을 예측하는 문제
추가 Feature engineering이 필요 없다는 점이 Wide & Deep과의 차이점이다
Explicit / implicit 한 데이터를 모두 모델링하려고 하는 모델이다
!pip install torchfm
import numpy as np
import torch
import torch.nn.functional as F
class 설정
class FeaturesLinear(torch.nn.Module):
    """First-order (linear) term over a set of categorical fields.

    All fields share one embedding table with ``sum(field_dims)`` rows. Each
    field's local index is shifted by a per-field offset so that it addresses
    its own slice of that table.

    :param field_dims: number of distinct values in each field
    :param output_dim: size of the linear output per sample
    """

    def __init__(self, field_dims, output_dim=1):
        super().__init__()
        # ``sum`` over a NumPy array yields a NumPy scalar; pass a plain int on.
        self.fc = torch.nn.Embedding(int(sum(field_dims)), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        # ``np.long`` was removed in NumPy 1.24; ``np.int64`` is the portable
        # spelling and matches the Long tensors the model is fed.
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, output_dim)``
        """
        # ``new_tensor`` creates the offsets with the dtype and device of ``x``.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias
class FeaturesEmbedding(torch.nn.Module):
    """Dense embedding lookup for a set of categorical fields.

    All fields share one embedding table with ``sum(field_dims)`` rows. Each
    field's local index is shifted by a per-field offset so that it addresses
    its own slice of that table.

    :param field_dims: number of distinct values in each field
    :param embed_dim: size of each embedding vector
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        # ``sum`` over a NumPy array yields a NumPy scalar; pass a plain int on.
        self.embedding = torch.nn.Embedding(int(sum(field_dims)), embed_dim)
        # ``np.long`` was removed in NumPy 1.24; ``np.int64`` is the portable
        # spelling and matches the Long tensors the model is fed.
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        # ``new_tensor`` creates the offsets with the dtype and device of ``x``.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)
class FactorizationMachine(torch.nn.Module):
    """Pairwise interaction term of a factorization machine.

    Computes ``0.5 * ((sum_f v_f) ** 2 - sum_f v_f ** 2)`` over the field axis.

    :param reduce_sum: when true, also sum the result over the embedding axis
    """

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        :return: ``(batch_size, 1)`` if ``reduce_sum`` else ``(batch_size, embed_dim)``
        """
        summed_then_squared = x.sum(dim=1).pow(2)
        squared_then_summed = x.pow(2).sum(dim=1)
        interactions = summed_then_squared - squared_then_summed
        if not self.reduce_sum:
            return 0.5 * interactions
        return 0.5 * interactions.sum(dim=1, keepdim=True)
class MultiLayerPerceptron(torch.nn.Module):
    """Stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stages.

    :param input_dim: width of the input features
    :param embed_dims: output width of each hidden stage, in order
    :param dropout: dropout probability used after every hidden stage
    :param output_layer: when true, append a final ``Linear`` with one output
    """

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        modules = []
        in_features = input_dim
        for out_features in embed_dims:
            modules += [
                torch.nn.Linear(in_features, out_features),
                torch.nn.BatchNorm1d(out_features),
                torch.nn.ReLU(),
                torch.nn.Dropout(p=dropout),
            ]
            # The next stage consumes what this one produced.
            in_features = out_features
        if output_layer:
            modules.append(torch.nn.Linear(in_features, 1))
        self.mlp = torch.nn.Sequential(*modules)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        return self.mlp(x)
class DeepFactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of DeepFM.

    The prediction is the sigmoid of three summed parts: a linear term, a
    factorization-machine interaction term, and an MLP over the flattened
    field embeddings. The FM and MLP parts share the same embeddings.

    Reference:
        H Guo, et al. DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 2017.

    :param field_dims: number of distinct values in each field
    :param embed_dim: size of each field embedding
    :param mlp_dims: hidden widths of the MLP
    :param dropout: dropout probability used inside the MLP
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, dropout):
        super().__init__()
        self.linear = FeaturesLinear(field_dims)
        self.fm = FactorizationMachine(reduce_sum=True)
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        # Width of the per-sample embeddings once flattened for the MLP.
        self.embed_output_dim = len(field_dims) * embed_dim
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: Float tensor of size ``(batch_size,)`` with values in ``(0, 1)``
        """
        embedded = self.embedding(x)
        first_order = self.linear(x)
        second_order = self.fm(embedded)
        deep = self.mlp(embedded.view(-1, self.embed_output_dim))
        logit = first_order + second_order + deep
        return torch.sigmoid(logit.squeeze(1))
Load dataset and Train model
# Directory that holds ``rates.csv`` (KMRDDataset joins this path with that
# file name). The literal below is a placeholder meaning "file path" and must
# be replaced with a real directory before running.
data_path = '파일 경로'
import torch.utils.data
class KMRDDataset(torch.utils.data.Dataset):
    """User/movie rating dataset read from ``rates.csv`` for CTR-style training.

    The file needs ``user`` and ``movie`` columns, and its first three columns
    must be ``user``, ``movie`` and the rating, in that order. User and movie
    ids are re-indexed to consecutive integers starting at 0, in order of
    first appearance. Ratings are binarised: strictly greater than 9 becomes
    1, everything else becomes 0.

    :param data_path: directory that contains ``rates.csv``
    :param max_rows: keep only the first ``max_rows`` rows; ``None`` keeps all
    """

    def __init__(self, data_path, max_rows=10000):
        data = pd.read_csv(os.path.join(data_path, 'rates.csv'))[:max_rows]

        user_to_index = {original: idx for idx, original in enumerate(data['user'].unique())}
        movie_to_index = {original: idx for idx, original in enumerate(data['movie'].unique())}
        data['user'] = data['user'].map(user_to_index)
        data['movie'] = data['movie'].map(movie_to_index)

        # Positional slice: columns 0..2 are (user, movie, rate).
        data = data.to_numpy()[:, :3]
        # ``np.int`` and ``np.long`` were removed in NumPy 1.24; use the
        # explicit 64-bit type, which also matches torch Long tensors.
        self.items = data[:, :2].astype(np.int64)
        self.targets = self.__preprocess_target(data[:, 2]).astype(np.float32)
        # Largest index per field + 1 gives the number of distinct values.
        self.field_dims = np.max(self.items, axis=0) + 1
        self.user_field_idx = np.array((0,), dtype=np.int64)
        self.item_field_idx = np.array((1,), dtype=np.int64)

    def __len__(self):
        """Return the number of samples."""
        return self.targets.shape[0]

    def __getitem__(self, index):
        """Return ``(items[index], targets[index])`` for the given index."""
        return self.items[index], self.targets[index]

    def __preprocess_target(self, target):
        """Return a new 0/1 array: 1 where ``target > 9``, else 0.

        The input array is left untouched.
        """
        return np.where(target > 9, 1, 0)
import pandas as pd
import os
# Build the dataset, then print the pieces the model layers are built from:
# field indices, per-field cardinalities, total table size, and a sample
# embedding table, bias parameter and offset vector.
dataset = KMRDDataset(data_path=data_path)
print(dataset.item_field_idx)
print(dataset.field_dims)
print(sum(dataset.field_dims))
print(torch.nn.Embedding(sum(dataset.field_dims), 16))
print(torch.nn.Parameter(torch.zeros((1,))))
# ``np.long`` was removed in NumPy 1.24; ``np.int64`` is the portable spelling.
print(np.array((0, *np.cumsum(dataset.field_dims)[:-1]), dtype=np.int64))
# 80% / 10% split for train and validation; the test split takes whatever is
# left, so the three lengths always add up to len(dataset).
train_length = int(0.8 * len(dataset))
valid_length = int(0.1 * len(dataset))
test_length = len(dataset) - (train_length + valid_length)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_length, valid_length, test_length],
)
from torch.utils.data import DataLoader
# Batched loaders for training and validation; the test loader yields one
# sample at a time.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=16)
valid_data_loader = DataLoader(dataset=valid_dataset, batch_size=16)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=1)

# Show the re-indexed (user, movie) pairs and the binarised targets.
print(dataset.items)
print(dataset.targets)

model = DeepFactorizationMachineModel(
    field_dims=dataset.field_dims,
    embed_dim=16,
    mlp_dims=(16, 16),
    dropout=0.2,
)
# Bare expression: shows the model repr when it ends a notebook cell.
model

# The model already applies a sigmoid, so plain BCELoss is used.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
import tqdm
# One pass over the training loader; the progress bar shows the mean loss of
# the most recent ``log_interval`` batches.
log_interval = 100

model.train()
total_loss = 0
tk0 = tqdm.tqdm(train_data_loader, smoothing=0, mininterval=1.0)
for i, (fields, target) in enumerate(tk0):
    # Tensors are used as produced by the loader; no device transfer is done.
    y = model(fields)
    loss = criterion(y, target.float())

    # Clear stale gradients, backpropagate, then update the weights.
    model.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    if (i + 1) % log_interval != 0:
        continue
    # Report the running mean and start a fresh window.
    tk0.set_postfix(loss=total_loss / log_interval)
    total_loss = 0
728x90
'딥러닝 템플릿 > 추천시스템(RS) 코드' 카테고리의 다른 글
[추천 시스템(RS)] AutoEncoder Meet Collaborative Filtering (0) | 2022.07.21 |
---|---|
[추천 시스템(RS)] Wide & Deep Learning for Recommender System (0) | 2022.07.20 |
[추천 시스템(RS)] Factorization Machine (0) | 2022.07.19 |
[추천 시스템(RS)] Neural Collaborative Filtering (0) | 2022.07.19 |
[추천 시스템(RS)] Matrix Factorization (0) | 2022.07.19 |
Comments