Recommendation Systems for Amazon Books¶
# !pip install -q scikit-surprise
Overview:¶
In this project, you will build recommender systems using different approaches.
We will be using the Amazon Review data: https://nijianmo.github.io/amazon/index.html
Note that this is a fairly large dataset. We will use the Books reviews from the "Small" subsets for experimentation, specifically the ratings-only (51,311,621 ratings) dataset, which is listed in the last section of that page. However, you are required to read through the entire document before proceeding.
Even though we reduce the data size at various stages, this is still a fairly large dataset, so you should write your program with computational cost (both spatial and temporal) in mind. 5% of the grade goes to programming efficiency.
- In general, delete a Python object as soon as you no longer need it (automatic garbage collection will do it eventually, but probably not in time); see the snippet after this list:
del some_data_frame
- In general, a vectorized operation runs much more efficiently than a for loop:
# slow:
total = 0
for data in dataframe["grades"]:
    if data > 5:
        total += data
print(total)

# much faster:
print(dataframe["grades"][dataframe["grades"] > 5].sum())
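On the first point above, a minimal, self-contained sketch (the frame here is only a placeholder standing in for your real data): del drops the name's reference, and an explicit gc.collect() right after asks the garbage collector to reclaim the memory sooner rather than later.
import gc
import pandas as pd

some_data_frame = pd.DataFrame({"grades": range(1_000_000)})  # placeholder object standing in for real data
del some_data_frame   # drop the reference once the object is no longer needed
gc.collect()          # ask the garbage collector to reclaim the memory now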
Task one: Data ingest (10pts)¶
Please download the dataset first. If you expect to work on this dataset over a period of several days (which is expected), you can mount your Google Drive by clicking the folder icon in Colab's left menu and then the Google Drive icon (the third icon at the top after clicking the folder icon); your drive will be mounted as "/content/drive/MyDrive". Then use the Linux copy command (see the cheatsheet or search for it) to copy the file to your drive; next time you start a new Colab session, you can simply copy it back. This dataset contains book reviews from Amazon, with the following columns:
- User ID
- Book ID
- Rating (1 to 5)
- Date the rating was given (Unix timestamp; search for how to convert it to a datetime object. For validation, 1123804800 should convert to 2005-08-12 00:00:00; see the quick check after this list)
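A quick check of the timestamp conversion mentioned above (pd.to_datetime with unit="s" is also what the loading code below uses):
import pandas as pd
# validate the example timestamp from the column description
print(pd.to_datetime(1123804800, unit="s"))   # 2005-08-12 00:00:00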
Because of the large data size, you want to create a small dataset from the first 10,000 lines (reference the Spark/Docker tutorial and project guide for how to do this), play with that file until your code runs correctly, and only then switch to the full-size dataset. Let's assume the small dataset is books10m.csv.
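One way to create such a small file, as a minimal sketch (a shell one-liner such as head -n 10000 Books.csv > books10k.csv works equally well; the file names here are illustrative):
with open("Books.csv") as fin, open("books10k.csv", "w") as fout:
    for i, line in enumerate(fin):
        if i >= 10_000:          # keep only the first 10,000 lines
            break
        fout.write(line)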
import pandas as pd
import gc
from datetime import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os
import sqlite3
from scipy.sparse import csr_matrix
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
import warnings
from surprise import accuracy
from surprise.model_selection import KFold
import io
import random
os.chdir(r"D:\MyDrive2\pythonprojects\class2\DataMining\homework\final_project")
df_book = pd.read_csv(r"E:\PythonProjects\datamining\final_project\data\books.csv")
df_book = df_book.rename(columns={
df_book.columns[0]: "user_id",
df_book.columns[1]: "book_id",
df_book.columns[2]: "rating",
df_book.columns[3]: "date"
})
df_book['date'] = pd.to_datetime(df_book['date'], unit='s')
df_book.head()
| | user_id | book_id | rating | date |
|---|---|---|---|---|
| 0 | 0001713353 | A1REUF3A1YCPHM | 5.0 | 2005-03-30 |
| 1 | 0001713353 | A1YRBRK2XM5D5 | 5.0 | 2004-04-04 |
| 2 | 0001713353 | A1V8ZR5P78P4ZU | 5.0 | 2004-02-21 |
| 3 | 0001713353 | A2ZB06582NXCIV | 5.0 | 2016-10-03 |
| 4 | 0001713353 | ACPQVNRD3Z09X | 5.0 | 2016-07-29 |
df_book.iloc[0, :]
user_id 0001713353 book_id A1REUF3A1YCPHM rating 5.0 date 2005-03-30 00:00:00 Name: 0, dtype: object
df_book.dtypes, df_book.shape
(user_id object book_id object rating float64 date datetime64[ns] dtype: object, (51311620, 4))
Tried several ways to load the whole json file into memory.¶
# import gzip
# import shutil
#
# # Path to your .gz file
# input_path = r"E:\PythonProjects\datamining\final_project\data\Books_5.json.gz"
# # Path for the output file (remove .gz extension)
# output_path = r"E:\PythonProjects\datamining\final_project\data\Books_5.json"
#
# with gzip.open(input_path, 'rb') as f_in:
# with open(output_path, 'wb') as f_out:
# shutil.copyfileobj(f_in, f_out)
# df_rating = pd.read_json(r"E:\PythonProjects\datamining\final_project\data\Books_5.json.gz", compression='gzip',lines=True)
# df_rating = pd.read_json(r"E:\PythonProjects\datamining\final_project\data\Books_5.json", lines=True)
# import json
# import pprint
#
# with open(r"E:\PythonProjects\datamining\final_project\data\Books_5.json", 'r', encoding="utf-8",errors="ignore") as f:
# lines = f.readlines()
# print(f"Total lines: {len(lines)}")
# for line in lines:
# print(line)
# jline = json.loads(line)
# pprint.pprint(jline)
# break
# chunks = []
# chunksize = 100000 # Adjust based on your available memory
# for chunk in pd.read_json(r"E:\PythonProjects\datamining\final_project\data\Books_5.json",
# lines=True,
# chunksize=chunksize):
# chunks.append(chunk)
#
# df_rating = pd.concat(chunks, ignore_index=True)
# import numpy as np
# from scipy.sparse import csr_matrix
# import pandas as pd
#
# # Read the JSON data in chunks to handle large file size
# chunks = []
# chunksize = 100000
# for chunk in pd.read_json(r"E:\PythonProjects\datamining\final_project\data\Books_5.json",
# lines=True,
# chunksize=chunksize):
# chunks.append(chunk)
#
# # Combine all chunks into one dataframe
# df_rating = pd.concat(chunks, ignore_index=True)
#
# # Create user and item mappings to convert IDs to indices
# unique_users = df_rating['reviewerID'].unique()
# unique_books = df_rating['asin'].unique()
# user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
# book_to_idx = {book: idx for idx, book in enumerate(unique_books)}
#
# # Convert user and item IDs to indices
# user_indices = [user_to_idx[user] for user in df_rating['reviewerID']]
# book_indices = [book_to_idx[book] for book in df_rating['asin']]
# ratings = df_rating['overall'].values
#
# # Create the sparse CSR matrix
# rating_matrix = csr_matrix((ratings, (user_indices, book_indices)),
# shape=(len(unique_users), len(unique_books)))
Task Two: Reservoir Sampling (this involves an algorithm we have not covered yet; if you feel confused, you can skip this step until after the relevant lecture) (15pts)¶
Write a function Reservoir_Sampling:
def Reservoir_Sampling(infile_name, outfile_name, k):
# your code
It takes the input file name infile_name, which in this case is the "Books.csv" you downloaded, and samples k lines, writing them to outfile_name. We use the input file to mimic streaming data by reading it line by line: you want to sample uniformly from a stream that is effectively infinitely long, ensuring that at any moment, for the n data points received so far, you keep k points uniformly sampled from those n points. Note that n keeps growing without bound.
We will implement an algorithm called Reservoir Sampling to do this, which works as follows:
- if n <= k, we keep all incoming data points from the stream in a list KEEP;
- when the $n^{th}$ data point arrives and n > k, roll a dice that gives the new data point a probability of k/n of being appended to KEEP;
- if the $n^{th}$ data point is to be added to KEEP and doing so would cause len(KEEP) == k+1, we roll another dice to randomly pick a point from KEEP (before appending) to throw away; each data point in KEEP has equal probability of being discarded.
The idea is to read the input file line by line and use the Reservoir Sampling algorithm to keep k lines. When all lines from the input file are processed, write KEEP to the output file outfile_name. You will build your recommender system from this randomly sampled file.
This project asks for k = 10,000,000 (10 million; you can use a smaller number before submission):
The following code should create the sampled file books10m.csv with 10 million lines.
Reservoir_Sampling('Books.csv', 'books10m.csv', 10000000)
If you don't know how to read a text file line by line, consult the Python cheatsheet or search for it.
# Reset IPython history
# %reset -f
def Reservoir_Sampling(infile_name, outfile_name, k):
# Increase default buffer size for better performance with large files
buffer_size = 8 * 1024 * 1024 # 8MB buffer
with io.open(infile_name, 'rb', buffering=buffer_size) as infile, \
io.open(outfile_name, 'wb', buffering=buffer_size) as outfile:
KEEP = [] # Reservoir storage
n = 0 # Counter for total lines processed
# Process input file line by line
for line in infile:
n += 1
if n <= k:
# Fill reservoir until k elements
KEEP.append(line)
else:
# For nth element, pick it with probability k/n
j = random.randint(0, n - 1)
if j < k:
KEEP[j] = line
# Write all kept lines to output file
outfile.writelines(KEEP)
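Note that drawing a single index j uniformly from 0..n-1 and replacing KEEP[j] whenever j < k is equivalent to the two-dice description above (inclusion with probability k/n followed by uniform eviction). An optional sanity check on a small synthetic stream (illustrative only, not part of the assignment):
from collections import Counter
import random

def reservoir(stream, k):
    keep = []
    for n, x in enumerate(stream, start=1):
        if n <= k:
            keep.append(x)
        else:
            j = random.randint(0, n - 1)
            if j < k:
                keep[j] = x
    return keep

# over many repetitions each of the 100 items should be kept about 2000 * 10 / 100 = 200 times
counts = Counter()
for _ in range(2000):
    counts.update(reservoir(range(100), 10))
print(counts.most_common(3), counts.most_common()[-3:])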
Run code below after your function is done¶
You should be able to generate a new file 'books10m.csv' after running the code below; it samples from "Books.csv" and has 10 million lines.
# DO NOT MODIFY THIS CELL
Reservoir_Sampling(r'E:\PythonProjects\datamining\final_project\data\Books.csv',
r'E:\PythonProjects\datamining\final_project\data\books10m.csv', 10000000)
# Save an even smaller file books10k.csv for quick tests
Reservoir_Sampling(r'E:\PythonProjects\datamining\final_project\data\Books.csv',
r'E:\PythonProjects\datamining\final_project\data\books10k.csv', 10000)
df_book = pd.read_csv(r"E:\PythonProjects\datamining\final_project\data\books10m.csv", usecols=[0, 1, 2, 3])
df_book = df_book.rename(columns={
df_book.columns[0]: "user_id",
df_book.columns[1]: "book_id",
df_book.columns[2]: "rating",
df_book.columns[3]: "date"
})
df_book['date'] = pd.to_datetime(df_book['date'], unit='s')
df_book.shape
(9999999, 4)
Task Three: Data Exploration (15pts)¶
Please explore your data with appropriate plots and tables/printouts: total counts, value/count distributions of user-item interactions, time dependency of the statistics, etc. (anything that gives good insight into the data and is helpful for the predictions).
df_book
| | user_id | book_id | rating | date |
|---|---|---|---|---|
| 0 | 0804138656 | A5SPCIGNLRUH9 | 5.0 | 2015-05-23 |
| 1 | 1477830138 | A39CRGVP6S5IGR | 5.0 | 2015-09-15 |
| 2 | 0932194540 | A2C13HP7R5RIJ9 | 5.0 | 2010-12-21 |
| 3 | 038531972X | A34VFF6N5QMPUC | 5.0 | 2018-01-11 |
| 4 | 0553807404 | A79SSJPXFXUH3 | 5.0 | 2014-07-02 |
| ... | ... | ... | ... | ... |
| 9999994 | 193590437X | A2ZIT1GZZR980D | 5.0 | 2012-10-29 |
| 9999995 | 0800721225 | AO6NHS4MBHSZ9 | 2.0 | 2013-01-20 |
| 9999996 | 1456337688 | A2UTBYYF2SJFXN | 5.0 | 2011-03-17 |
| 9999997 | 1975926773 | ADQFI9RB4T1VL | 5.0 | 2018-03-25 |
| 9999998 | 1593639589 | A3MW3GO7LPQ39J | 5.0 | 2014-07-22 |
9999999 rows × 4 columns
# Your code
total_rating_number = len(df_book)
total_user_number = len(df_book.user_id.unique())
total_book_number = len(df_book.book_id.unique())
global_average_rating = df_book.rating.mean()
user_average_rating = df_book.groupby('user_id').mean('rating')
book_average_rating = df_book.groupby('book_id').mean('rating')
matrix_density = total_rating_number / (total_user_number * total_book_number)
print(f"Total Rating Number: {total_rating_number}")
print(f"Total User Number: {total_user_number}")
print(f"Total Book Number: {total_book_number}")
print(f"Global Average Rating: {global_average_rating:.2f}")
print(f"Matrix Density: {matrix_density:.4%}")
Total Rating Number: 9999999
Total User Number: 1588262
Total Book Number: 5345280
Global Average Rating: 4.39
Matrix Density: 0.0001%
df_book.rating.value_counts()
rating
5.0    6620794
4.0    1863196
3.0     747807
1.0     406772
2.0     361429
0.0          1
Name: count, dtype: int64
df_book.date.describe()
count                          9999999
mean     2014-04-29 17:11:00.764587264
min                1996-05-31 00:00:00
25%                2013-05-19 00:00:00
50%                2015-02-12 00:00:00
75%                2016-08-25 00:00:00
max                2018-10-01 00:00:00
Name: date, dtype: object
df_book['year'] = df_book['date'].dt.year
years = sorted(df_book['year'].unique())
plt.figure(figsize=(18, 5))
for idx, year in enumerate(years):
year_data = df_book[df_book['year'] == year]['rating']
# Create KDE plot for each year with offset
sns.kdeplot(data=year_data,
fill=False,
alpha=0.5,
linewidth=1.5,
bw_adjust=1,
label=f'Year {year}')
plt.legend()
[Figure: overlaid KDE curves of the rating distribution, one curve per year]
Task Four: Trimming your data (15pts)¶
Since the dataset is pretty sparse, please trim down the size of the data by:
- removing data before '2010-01-01 00:00:00' (let's focus on more recent data);
- removing books that received less than or equal to 2 user ratings;
- removing users that reviewed less than or equal to m books, where m is the 80% quantile of the per-user rating count (i.e., the variable to take the quantile over is the total count of ratings by each user) -- we keep only the top 20% of users who are most active in rating books.
Combine the above steps into a function trim(book_data_frame) which takes an input dataframe as its argument and returns the trimmed dataframe.
Call trim() to trim your dataset.
Print out the statistics before and after trimming: total users, total books, and the dataframe shape.
# filter by time
df_book = df_book[df_book['date'] > '2010-01-01 00:00:00']
df_book.shape
(8959639, 5)
#filter by rate count
df_book_cnt = df_book.groupby('book_id')['rating'].count()
df_book_cnt = df_book_cnt[df_book_cnt > 2]
df_book_cnt.shape
(644976,)
#filter by user rate count
df_user_cnt = df_book.groupby('user_id')['rating'].count()
df_user_cnt = df_user_cnt.sort_values(ascending=False)[:int(len(df_user_cnt) * 0.2)]
df_user_cnt.shape
(280940,)
def trim(book_data_frame: pd.DataFrame) -> pd.DataFrame:
    # filter by time: keep only ratings from 2010-01-01 onward
    # (an earlier attempt converted the cutoff to a Unix timestamp and compared via
    #  .apply(); comparing against the date string directly is much faster)
    df_date = book_data_frame[book_data_frame['date'] > '2010-01-01 00:00:00']
    # filter by book rating count: keep books with more than 2 ratings
    df_book_cnt = df_date.groupby('book_id')['rating'].count()
    df_book_cnt = df_book_cnt[df_book_cnt > 2]
    # filter by user rating count: keep the top 20% most active users
    df_user_cnt = df_date.groupby('user_id')['rating'].count()
    df_user_cnt = df_user_cnt.sort_values(ascending=False)[:int(len(df_user_cnt) * 0.2)]
    # apply the user and book filters to the date-filtered frame
    result = df_date[df_date['user_id'].isin(df_user_cnt.index)]
    result = result[result['book_id'].isin(df_book_cnt.index)]
    return result
df_book = trim(df_book)
total_rating_number = len(df_book)
total_user_number = len(df_book.user_id.unique())
total_book_number = len(df_book.book_id.unique())
global_average_rating = df_book.rating.mean()
user_average_rating = df_book.groupby('user_id').mean('rating')
book_average_rating = df_book.groupby('book_id').mean('rating')
matrix_density = total_rating_number / (total_user_number * total_book_number)
print(f"Total Rating Number: {total_rating_number}")
print(f"Total User Number: {total_user_number}")
print(f"Total Book Number: {total_book_number}")
print(f"Global Average Rating: {global_average_rating:.2f}")
print(f"Matrix Density: {matrix_density:.4%}")
Total Rating Number: 3288353
Total User Number: 268132
Total Book Number: 629094
Global Average Rating: 4.39
Matrix Density: 0.0019%
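For reference, the 80% quantile cutoff m mentioned in the task can also be computed directly with pandas' quantile; a sketch, assuming df_date is the date-filtered frame as inside trim() (up to ties at the threshold this matches the sort-and-slice approach used above):
user_counts = df_date.groupby('user_id')['rating'].count()
m = user_counts.quantile(0.8)                     # 80% quantile of per-user rating counts
active_users = user_counts[user_counts > m].index
print(len(active_users))                          # roughly the top 20% most active users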
# Save the checkpoint
# Commented out to avoid overwriting
# pickle.dump(df_book, open(r"E:\PythonProjects\datamining\final_project\data\book_trimed.pkl", "wb"))
Task Five: fit the model using the data. (20pts)¶
You should prepare your data for the surprise library. Check the documentation here for how to read data from a pandas data frame or from a text file -- you need to make sure your data frame or text file follows the required format shown in the example code.
Then run the code below; it takes less than 30 minutes to finish for the 'books10m.csv' dataset after trimming. Basically, it fits the data with a latent factor decomposition (SVD) that factorizes the user-item matrix into latent factor matrices P and Q. The dimension (number of latent features) of P and Q is specified by n_factors, which defaults to 100. See ?SVD for more options and details. The code chunk runs SVD 3 times (k-fold cross-validation, k=3 here), each time with a different random split of the data into a held-out test fold and a training set, and the RMSE is printed for each of the 3 runs.
Read documentation for parameters in SVD(): https://surprise.readthedocs.io/en/stable/matrix_factorization.html?highlight=lr_bu#surprise.prediction_algorithms.matrix_factorization.SVD.bu
- You should run the code with the default call to SVD(), as well as with n_factors=50 and 150, combined with lr_all=.01 and 0.1 and reg_all=.01 and 0.1 (make sure you understand what these parameters mean; relate them to the equations in the lecture slides for a better understanding).
- Compare both the RMSE and the running time for each case.
- Missing any of the boldfaced requirements causes a point deduction.
SVD().lr_bu
0.005
# Your code to prepare data
# Load the dataset
with open(r"E:\PythonProjects\datamining\final_project\data\book_trimed.pkl", 'rb') as file:
data = pickle.load(file)
data.shape
(3290537, 5)
data.rating.value_counts()
rating
5.0    2065271
4.0     739742
3.0     288198
2.0     112775
1.0      84550
0.0          1
Name: count, dtype: int64
data = data[data['rating'] > 0]
reader = Reader(rating_scale=(1, 5)) # Adjust rating scale as needed
data_set = Dataset.load_from_df(data[['user_id', 'book_id', 'rating']], reader)
Run code chunk below as is after data is loaded.¶
# define a cross-validation iterator
kf = KFold(n_splits=3)
algo = SVD()
for trainset, testset in kf.split(data_set):
# train and test algorithm.
algo.fit(trainset)
predictions = algo.test(testset)
# Compute and print Root Mean Squared Error
accuracy.rmse(predictions, verbose=True)
RMSE: 0.8878
RMSE: 0.8886
RMSE: 0.8900
Output example (yours may look different):
RMSE: 0.8948
RMSE: 0.8936
RMSE: 0.8968
Copy the code chunk above, use different parameters in the SVD call, then rerun.¶
# your code: n_factors=50 and 150, combined with lr_all=.01 and 0.1 and reg_all=.01 and 0.1
import time
lr_all = [0.01, 0.1]
reg_all = [0.01, 0.1]
n_factors = [50, 150]
def format_time(seconds):
minutes = seconds // 60
seconds = seconds % 60
return f"{int(minutes)}m {seconds:.2f}s"
df_rmse = pd.DataFrame(columns=['lr_all', 'reg_all', 'n_factor', 'time', 'rmse'])
for lr in lr_all:
for reg in reg_all:
for n_factor in n_factors:
algo = SVD(n_factors=n_factor, lr_all=lr, reg_all=reg)
rmses = []
start_time = time.time()
for trainset, testset in kf.split(data_set):
# train and test algorithm.
algo.fit(trainset)
predictions = algo.test(testset)
                # Compute the Root Mean Squared Error (verbose=False avoids printing it twice)
                print(f"lr_all:{lr}, reg_all:{reg}, n_factor:{n_factor}")
                rmse = accuracy.rmse(predictions, verbose=False)
                print(f"RMSE: {rmse:.4f}")
rmses.append(rmse)
end_time = time.time()
fold_time = end_time - start_time
mean_rmse = np.mean(rmses)
row = pd.DataFrame(
{'lr_all': [lr], 'reg_all': [reg], 'n_factor': [n_factor], 'time': [fold_time], 'rmse': [mean_rmse]})
df_rmse = pd.concat([df_rmse, row], ignore_index=True)
lr_all:0.01, reg_all:0.01, n_factor:50   RMSE (3 folds): 0.8838, 0.8840, 0.8858
lr_all:0.01, reg_all:0.01, n_factor:150  RMSE (3 folds): 0.8925, 0.8924, 0.8931
lr_all:0.01, reg_all:0.1, n_factor:50    RMSE (3 folds): 0.8755, 0.8767, 0.8773
lr_all:0.01, reg_all:0.1, n_factor:150   RMSE (3 folds): 0.8803, 0.8810, 0.8807
lr_all:0.1, reg_all:0.01, n_factor:50    RMSE (3 folds): 0.8982, 0.8988, 0.8996
lr_all:0.1, reg_all:0.01, n_factor:150   RMSE (3 folds): 0.8861, 0.8859, 0.8854
lr_all:0.1, reg_all:0.1, n_factor:50     RMSE (3 folds): 0.8922, 0.8922, 0.8927
lr_all:0.1, reg_all:0.1, n_factor:150    RMSE (3 folds): 0.8827, 0.8824, 0.8825
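For reference, surprise also provides a GridSearchCV helper that runs the same parameter sweep with less bookkeeping; a minimal sketch over the same grid (an alternative, not what was used above):
from surprise.model_selection import GridSearchCV

param_grid = {"n_factors": [50, 150], "lr_all": [0.01, 0.1], "reg_all": [0.01, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data_set)
print(gs.best_score["rmse"], gs.best_params["rmse"])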
# I stored the result in a sqlite db
df_rmse = pd.read_sql("SELECT * FROM rmse", sqlite3.connect("usr_rcs.sqlite"))
df_rmse
| | lr_all | reg_all | n_factor | time (s) | rmse |
|---|---|---|---|---|---|
| 0 | 0.01 | 0.01 | 50 | 111.410039 | 0.884048 |
| 1 | 0.01 | 0.01 | 150 | 135.617260 | 0.892240 |
| 2 | 0.01 | 0.10 | 50 | 109.017855 | 0.876004 |
| 3 | 0.01 | 0.10 | 150 | 138.573165 | 0.880057 |
| 4 | 0.10 | 0.01 | 50 | 109.442813 | 0.899201 |
| 5 | 0.10 | 0.01 | 150 | 138.581531 | 0.885470 |
| 6 | 0.10 | 0.10 | 50 | 110.087006 | 0.891641 |
| 7 | 0.10 | 0.10 | 150 | 137.073136 | 0.881697 |
Visualization in the Chrome browser¶
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "browser"
# Create interactive 3D scatter plot
fig = px.scatter_3d(
df_rmse,
x='reg_all',
y='n_factor',
z='rmse',
color='lr_all',
title='rmse vs reg_all vs n_factor',
labels={
'reg_all': 'reg_all',
'n_factor': 'n_factor',
'rmse': 'rmse',
'lr_all': 'Learning Rate'
},
color_discrete_sequence=px.colors.qualitative.Set1, # Use a discrete color palette
hover_data={
'rmse': ':.4f', # Format RMSE to 4 decimal places
'reg_all': True,
'n_factor': True
},
# Custom hover template
hover_name=None,
opacity=0.6
)
# Update layout for better visualization
fig.update_layout(
scene=dict(
xaxis_title='reg_all',
yaxis_title='n_factor',
zaxis_title='rmse'
),
width=900,
height=700,
margin=dict(l=0, r=0, b=0, t=30)
)
fig.update_traces(
hovertemplate="<br>".join([
"RMSE: %{z:.4f}",
"Regularization: %{x}",
"n_factor: %{y}",
])
)
# Show the plot
fig.show()
Write down your observations and explain what you see.¶
My observation: The matrix is sparse and likely noisy, so a lower n_factor and a higher reg_all are beneficial for a lower RMSE.¶
Task Six: use the model for recommendation (15pts)¶
Use any one of the users from the trainset above in the last code chunk (n_factors=150) to predict his/her book ratings. Print the top 10 books that he/she did not rate but would be predicted to rate highest, as recommended books.
You can reference document here: https://surprise.readthedocs.io/en/stable/getting_started.html#train-on-a-whole-trainset-and-the-predict-method
Read the example code on how to do this. Please note that for prediction it doesn't matter whether
algo = KNNBasic()
or
algo = SVD()
The syntax for prediction is the same.
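A minimal predict call, assuming algo has already been fit as above (the IDs are illustrative values taken from the first table in this notebook; .est holds the estimated rating):
pred = algo.predict("0001713353", "A1REUF3A1YCPHM")   # raw (user_id, book_id) as they appear in the dataframe
print(pred.est)                                        # estimated rating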
Recommend the top 10 books for a uid¶
def get_top_10_book(uid: str = "0373244045"):
    book_list = data.book_id.unique()
    # The books this user has rated (a set makes the membership test below fast)
    user_book = set(data[data['user_id'] == uid]['book_id'].unique())
    # The books not rated by this user
    user_not_book = [i for i in book_list if i not in user_book]
    # Predict the rating for the books not rated by this user (.est is the estimated rating)
    user_not_book_rate = [algo.predict(uid, i).est for i in user_not_book]
    # Sort the book indices by predicted rating in descending order and take the top 10
    top_n_book = np.argsort(user_not_book_rate)[::-1][:10]
    # Get the book ids for the top 10 indices
    selected_books = [user_not_book[i] for i in top_n_book]
    return selected_books
get_top_10_book('1500336513')
['A287QJTE1HFZZ5', 'A15Y0XJPEGJYTK', 'A6R7UMU1J68XQ', 'A2MOAH15GI79HF', 'ATZESZ8SZOWDF', 'APBOM5Z9PHPUH', 'AIR269ZFK02P8', 'A3HEPG2AS0YG6Z', 'A5UT8WKIJILKA', 'A1KF55C7XRPSC9']
data
| | user_id | book_id | rating | date | year |
|---|---|---|---|---|---|
| 5 | 0001713353 | AVP0HXC9FG790 | 5.0 | 2016-06-20 | 2016 |
| 11 | 0001713353 | A2RE7WG349NV5D | 5.0 | 2015-07-09 | 2015 |
| 15 | 1451693494 | A3518R1F46NWOM | 5.0 | 2014-08-19 | 2014 |
| 21 | 1502830825 | A1FW6L02VSCUYH | 5.0 | 2015-01-04 | 2015 |
| 23 | 0001713353 | A3M314LI9OYME | 5.0 | 2013-10-08 | 2013 |
| ... | ... | ... | ... | ... | ... |
| 9999984 | 0553393235 | A2L0GFTTI618RU | 5.0 | 2013-10-05 | 2013 |
| 9999987 | 042526257X | AZTFBJTENKT1T | 5.0 | 2015-11-24 | 2015 |
| 9999994 | 1478227133 | A2USTMJDOQPOA2 | 3.0 | 2014-09-22 | 2014 |
| 9999995 | 1490914943 | AGE28EEF9PE6U | 5.0 | 2014-06-08 | 2014 |
| 9999998 | 1500165972 | A1TNVUMSPY8BM1 | 4.0 | 2014-06-10 | 2014 |
3290536 rows × 5 columns
Task Seven: build the SVD from scratch. (20pts, including 10pts as bonus)¶
Reference the article here and the notebook here for doing SVD/matrix factorization from scratch without using surprise. You can copy the author's code, but you must use our reservoir-sampled data from Books.csv, and you can further reduce the data size to 10% of the rows to speed up computation.
You want to play with the author's code first, and once you understand it, plug in our dataset.
You can use the train/test split function from surprise to do train/test split:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)
# train your copied model with our data
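If you use this split, the trainset and testset objects need to be converted into plain (user index, item index, rating) samples for the MF class below; a sketch of one way to do it (surprise's Trainset provides all_ratings(), to_inner_uid() and to_inner_iid(), and testset is a plain list of (raw user, raw item, rating) tuples):
# trainset.all_ratings() yields (inner_uid, inner_iid, rating); inner ids are contiguous
# integers, so they can be used directly as row/column indices for P and Q
train_samples = list(trainset.all_ratings())

# testset entries use raw ids; map them to inner ids, skipping pairs unseen in training
test_samples = []
for raw_u, raw_i, r in testset:
    try:
        test_samples.append((trainset.to_inner_uid(raw_u), trainset.to_inner_iid(raw_i), r))
    except ValueError:
        pass  # user or item not present in the trainset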
Important Note:
- This is a fairly large dataset. To avoid crashing because memory blows up, some students in the past resampled the ratings down to 3,489,820 rows in total. This is fine if you do not care about bonus points. For full bonus points, you are allowed to randomly resample at least 30% of the rows after Task Four and then trim again to discard books that receive equal to or less than 2 user ratings (after the 30% resampling, some books will lose user ratings and become rare books again).
- Trick for memory problems: compute how many GB are used by P and Q when K is, say, 200. Those are the two largest memory chunks in the project if you program it properly. Also compute what would happen if you directly created the utility matrix, so you know why you should avoid it at any stage of the project (see the back-of-the-envelope calculation below). In Colab the available memory is ~10 GB. Setting the numpy dtype to "float32" (each float takes 4 bytes) halves the memory usage compared with the default "double"/"float64".
- In addition, you must use a sparse matrix representation for the utility matrix to avoid crashing (refer to the previous lab and the documentation for csr_matrix in scipy.sparse).
- You have to figure out how to convert the trainset and testset objects into the data format expected by the MF object. You can use Python's dir() function to explore the methods/attributes of trainset and testset, but the best way is to read the surprise documentation (the conversion sketch above is one possibility).
- Then you call the get_rating method of the MF class to predict ratings for the testset, and compute the RMSE using the testset ratings as ground truth.
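A back-of-the-envelope check of the memory point above, using the user and book counts from the trimmed data earlier (the numbers are illustrative; plug in your own counts):
# sizes of the latent factor matrices P (users x K) and Q (items x K) at float32
n_users, n_items, K = 268_132, 629_094, 200
bytes_pq = (n_users + n_items) * K * 4
print(f"P + Q at float32: {bytes_pq / 1e9:.2f} GB")                      # well under Colab's ~10 GB

# a dense utility matrix of the same users x items would be hopeless
print(f"dense utility matrix at float32: {n_users * n_items * 4 / 1e12:.1f} TB")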
class MF():
def __init__(self, R, K, alpha, beta, iterations):
"""
Perform matrix factorization to predict empty
entries in a matrix.
Arguments
- R (ndarray) : user-item rating matrix
- K (int) : number of latent dimensions
- alpha (float) : learning rate
- beta (float) : regularization parameter
"""
self.R = R
self.num_users, self.num_items = R.shape
self.K = K
self.alpha = alpha
self.beta = beta
self.iterations = iterations
    def train(self):
        # Initialize the user and item latent feature matrices
self.P = np.random.normal(scale=1. / self.K, size=(self.num_users, self.K))
self.Q = np.random.normal(scale=1. / self.K, size=(self.num_items, self.K))
# Initialize the biases
self.b_u = np.zeros(self.num_users)
self.b_i = np.zeros(self.num_items)
self.b = np.mean(self.R[np.where(self.R != 0)])
# Create a list of training samples
self.samples = [
(i, j, self.R[i, j])
for i in range(self.num_users)
for j in range(self.num_items)
if self.R[i, j] > 0
]
# Perform stochastic gradient descent for number of iterations
training_process = []
for i in range(self.iterations):
np.random.shuffle(self.samples)
self.sgd()
mse = self.mse()
training_process.append((i, mse))
if (i + 1) % 10 == 0:
print("Iteration: %d ; error = %.4f" % (i + 1, mse))
return training_process
    def mse(self):
        """
        Compute the square root of the total squared error over the observed entries
        """
xs, ys = self.R.nonzero()
predicted = self.full_matrix()
error = 0
for x, y in zip(xs, ys):
error += pow(self.R[x, y] - predicted[x, y], 2)
return np.sqrt(error)
    def sgd(self):
        """
        Perform stochastic gradient descent
        """
        for i, j, r in self.samples:
            # Compute prediction and error
prediction = self.get_rating(i, j)
e = (r - prediction)
# Update biases
self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])
# Update user and item latent feature matrices
self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j, :])
def get_rating(self, i, j):
"""
Get the predicted rating of user i and item j
"""
prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
return prediction
    def full_matrix(self):
        """
        Compute the full matrix using the resulting biases, P and Q
        """
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)
R = np.array([
[5, 3, 0, 1],
[4, 0, 0, 1],
[1, 1, 0, 5],
[1, 0, 0, 4],
[0, 1, 5, 4],
])
mf = MF(R, K=2, alpha=0.1, beta=0.01, iterations=20)
mf.train()
mf.get_rating(2, 3)
mf.mse()
mf.full_matrix()
Iteration: 10 ; error = 0.3487 Iteration: 20 ; error = 0.0627
array([[4.97741296, 3.01432575, 2.15464326, 1.00159637],
[4.00816112, 2.56530194, 1.91469885, 1.01899245],
[1.01628937, 0.98161098, 5.84116035, 4.98098285],
[0.99735811, 1.03013045, 4.63620617, 3.9806118 ],
[1.47363508, 1.02646674, 4.97501264, 4.01042782]])
Load data¶
def count_lines(file_path):
with open(file_path, 'r') as f:
count = sum(1 for _ in f)
return count
def print_first_lines(file_path):
with open(file_path, 'r') as f:
print(f.readline())
# Use the function
file_path = r"E:\PythonProjects\datamining\final_project\data\books10m.csv"
print_first_lines(file_path)
total_lines = count_lines(file_path)
print(f"Total number of lines: {total_lines}")
0001713353,A1C6M8LCIX4M6M,5.0,1123804800 Total number of lines: 10000000
Commencement of official operations¶
Load data into a CSR matrix, getting the ratings, user-to-index, and book-to-index mappings¶
import numpy as np
from scipy.sparse import csr_matrix
import csv
def create_csr_from_csv(file_path, chunk_size=1000000):
# Dictionaries to map user/book IDs to indices
user_to_idx = {}
book_to_idx = {}
current_user_idx = 0
current_book_idx = 0
# Lists to store data for CSR matrix
rows = []
cols = []
data = []
with open(file_path, 'r') as f:
reader = csv.reader(f)
        # next(reader)  # Skip header if one exists (Books.csv has no header line)
for i, line in enumerate(reader):
user_id, book_id, rating = line[:3] # Take first 3 columns, 4th is date which is useless for us
rating = float(rating)
if rating <= 0:
continue
# Map IDs to indices
if user_id not in user_to_idx:
user_to_idx[user_id] = current_user_idx
current_user_idx += 1
if book_id not in book_to_idx:
book_to_idx[book_id] = current_book_idx
current_book_idx += 1
# Add to current chunk
rows.append(user_to_idx[user_id])
cols.append(book_to_idx[book_id])
data.append(float(rating))
            # Print progress every chunk_size kept rows
if len(rows) % chunk_size == 0:
print(f"Processed {i + 1} rows")
# Create the final CSR matrix
rating_matrix = csr_matrix(
(data, (rows, cols)),
shape=(len(user_to_idx), len(book_to_idx))
)
print(f"Final matrix shape: {rating_matrix.shape}")
print(f"Number of non-zero entries: {rating_matrix.nnz}")
print(f"Sparsity: {rating_matrix.nnz / (rating_matrix.shape[0] * rating_matrix.shape[1]):.4%}")
return rating_matrix, user_to_idx, book_to_idx
# Use the function
file_path = r"E:\PythonProjects\datamining\final_project\data\books10m.csv"
rating_matrix, user_to_idx, book_to_idx = create_csr_from_csv(file_path)
Processed 1000000 rows
Processed 2000000 rows
Processed 3000000 rows
Processed 4000000 rows
Processed 5000000 rows
Processed 6000000 rows
Processed 7000000 rows
Processed 8000001 rows
Processed 9000001 rows
Final matrix shape: (1588262, 5345281)
Number of non-zero entries: 9988363
Sparsity: 0.0001%
Dump the rating matrix, user-to-index, and book-to-index mappings to a file for future reference¶
# Saving the rating_matrix, user_to_idx, book_to_idx to file
rating_matrix_params = {
'rating_matrix': rating_matrix,
'user_to_idx': user_to_idx,
'book_to_idx': book_to_idx
}
with open('rating_matrix_params.pkl', 'wb') as f:
pickle.dump(rating_matrix_params, f)
# Loading the rating_matrix, user_to_idx, book_to_idx from file
with open('rating_matrix_params.pkl', 'rb') as f:
rating_matrix_params = pickle.load(f)
# Access the variables
rating_matrix = rating_matrix_params['rating_matrix']
user_to_idx = rating_matrix_params['user_to_idx']
book_to_idx = rating_matrix_params['book_to_idx']
rating_matrix.data.mean()
4.3980990678852985
Convert the CSR rating matrix to a list of tuples, so that it can easily be moved to the GPU¶
from tqdm import tqdm
def gen_samples(R):
# Get nonzero indices
i_indices, j_indices = R.nonzero()
total = len(i_indices)
# Create samples list with tqdm progress bar
samples = [
(i, j, R[i, j])
for i, j in tqdm(zip(i_indices, j_indices),
total=total,
desc="Generating samples")
]
return samples
samples = gen_samples(rating_matrix)
Generating samples: 100%|██████████| 9988363/9988363 [03:03<00:00, 54354.72it/s]
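Per-element indexing of a CSR matrix inside a comprehension is slow (about three minutes above); the same (row, col, rating) list can be built in a few seconds from the COO view, a sketch:
def gen_samples_fast(R):
    coo = R.tocoo()   # the row, col and data arrays of a COO matrix are aligned entry by entry
    return list(zip(coo.row.tolist(), coo.col.tolist(), coo.data.tolist()))

# samples = gen_samples_fast(rating_matrix)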
Import torch and confirm the existence of calculation hardware¶
import torch
print(f"CUDA: {torch.cuda.is_available()}")
# pip3 install torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
CUDA: True
Drop the sub-tank and get ready to fight¶
if ('mfcsr' in locals() or 'mfcsr' in globals()) and mfcsr is not None:
del mfcsr
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # reset_max_memory_allocated/_cached are deprecated aliases of reset_peak_memory_stats
    torch.cuda.reset_peak_memory_stats()
gc.collect()
# Check GPU memory usage
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())
0 0
# First delete any existing tensors/models
if ('mfcsr' in locals() or 'mfcsr' in globals()) and mfcsr is not None:
del mfcsr
# Clear all PyTorch GPU cached memory
if torch.cuda.is_available():
# Delete all references to tensors and models
for obj in gc.get_objects():
try:
if torch.is_tensor(obj):
if obj.is_cuda:
del obj
elif hasattr(obj, 'cuda'):
if hasattr(obj, 'is_cuda'):
if obj.is_cuda:
del obj
except Exception as e:
pass
# Empty cache
torch.cuda.empty_cache()
    # Reset peak stats (the reset_max_memory_* functions are deprecated aliases of this call)
    torch.cuda.reset_peak_memory_stats()
# Force garbage collection multiple times
for _ in range(2):
gc.collect()
# Check memory status
print(f"Allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
print(f"Reserved: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MB")
Allocated: 0.00 MB Reserved: 0.00 MB
Prototyping the battleship¶
# from tqdm import tqdm
# import numpy as np
# import torch
class MF_csr_gpu():
"""
Matrix Factorization class optimized for CSR matrix format with numerical stability improvements.
Uses float64 precision and includes safeguards against overflow/underflow conditions.
Parameters
----------
R : scipy.sparse.csr_matrix
User-item rating matrix in CSR format
K : int
Number of latent dimensions
alpha : float
Learning rate (between 0 and 1)
beta : float
Regularization parameter
iterations : int
Number of iterations for training
samples : list
Pre-generated training samples as (user, item, rating) tuples
Attributes
----------
P : ndarray
User latent feature matrix
Q : ndarray
Item latent feature matrix
b_u : ndarray
User biases
b_i : ndarray
Item biases
b : float
Global bias
"""
def __init__(self, R, K, alpha, beta, iterations, samples):
self.R = R
self.num_users, self.num_items = R.shape
self.K = K
self.alpha = np.float64(alpha)
self.beta = np.float64(beta)
self.iterations = iterations
self.samples = samples
# Numerical bounds for stability
self.MIN_RATING = 1.0
self.MAX_RATING = 5.0
self.EPSILON = np.finfo(np.float64).eps
# Check if CUDA is available
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(self.device)
torch.cuda.empty_cache()
def train(self):
# Initialize matrices with float64 dtype
rng = np.random.default_rng()
init_scale = np.sqrt(1.0 / self.K) # More stable initialization
self.P = rng.normal(0, init_scale, size=(self.num_users, self.K)).astype(np.float64)
self.Q = rng.normal(0, init_scale, size=(self.num_items, self.K)).astype(np.float64)
# Initialize biases with float64
self.b_u = np.zeros(self.num_users, dtype=np.float64)
self.b_i = np.zeros(self.num_items, dtype=np.float64)
self.b = np.float64(self.R.data.mean())
print(f"Number of training samples: {len(self.samples)}")
# Move data to GPU
self.b_u = torch.tensor(self.b_u, dtype=torch.float64, device="cuda")
self.b_i = torch.tensor(self.b_i, dtype=torch.float64, device="cuda")
self.P = torch.tensor(self.P, dtype=torch.float64, device="cuda")
self.Q = torch.tensor(self.Q, dtype=torch.float64, device="cuda")
self.b = torch.tensor(self.b, dtype=torch.float64, device="cuda")
training_process = []
for i in range(self.iterations):
# print(f"shuffling")
rng.shuffle(self.samples)
# print(f"Iteration: {i+1}")
# print("sgd")
self.sgd()
# print("mse")
mse = self.mse_gpu()
training_process.append((i, mse))
print("Iteration: %d ; error = %.4f" % (i + 1, mse))
return training_process
def mse_gpu(self):
# xs, ys = self.R.nonzero()
squared_error = 0.0
n_ratings = len(self.samples)
batch_size = 2000000
for i in range(0, n_ratings, batch_size):
# print(f"mse,i:{i}")
samples = torch.tensor(self.samples[i:i + batch_size], dtype=torch.float64,
device="cuda")
batch_xs = samples[:, 0]
batch_ys = samples[:, 1]
all_acctual_ratings = samples[:, 2]
P_batch = self.P[batch_xs.long()]
Q_batch = self.Q[batch_ys.long()]
# Compute dot products for the batch
dot_products = torch.sum(P_batch * Q_batch, dim=1)
predictions = self.b + self.b_u[batch_xs.long()] + self.b_i[batch_ys.long()] + dot_products
# predictions_gpu = torch.clamp(predictions, self.MIN_RATING, self.MAX_RATING)
# Calculate squared errors for this batch
batch_errors = all_acctual_ratings - predictions
batch_squared_error = torch.sum(batch_errors ** 2).item()
# Add to total squared error
squared_error += batch_squared_error
result = np.sqrt(squared_error / (n_ratings + self.EPSILON))
return result
def sgd(self):
"""
Perform stochastic gradient descent with numerical stability improvements.
"""
# Move data to GPU
batch_size = 1000000
for i in range(0, len(self.samples), batch_size):
# print(f"sgd,i:{i}")
samples = torch.tensor(self.samples[i:i + batch_size], dtype=torch.float64,
device="cuda")
# Unpack i, j, r from samples
i = samples[:, 0].long() # Convert to long for indexing
j = samples[:, 1].long() # Convert to long for indexing
r = samples[:, 2] # Ratings
# Predictions for all samples
predictions = (
torch.sum(self.P[i] * self.Q[j], dim=1) + self.b_u[i] + self.b_i[j] + self.b
)
# Compute errors
errors = r - predictions
# Gradient updates for biases
bu_grad = errors - self.beta * self.b_u[i]
bi_grad = errors - self.beta * self.b_i[j]
            # Update user and item biases (index_add_ accumulates correctly when a user/item repeats in a batch)
            self.b_u.index_add_(0, i, self.alpha * torch.clamp(bu_grad, -1e3, 1e3))
            self.b_i.index_add_(0, j, self.alpha * torch.clamp(bi_grad, -1e3, 1e3))
# Gradient updates for latent factors
P_grad = errors.unsqueeze(1) * self.Q[j] - self.beta * self.P[i]
Q_grad = errors.unsqueeze(1) * self.P[i] - self.beta * self.Q[j]
# Update latent factors
self.P.index_add_(0, i, self.alpha * torch.clamp(P_grad, -1e3, 1e3))
self.Q.index_add_(0, j, self.alpha * torch.clamp(Q_grad, -1e3, 1e3))
Focusing on the target¶
rating_matrix
<Compressed Sparse Row sparse matrix of dtype 'float64' with 9988363 stored elements and shape (1588262, 5345281)>
Instantiation and wrangling with the target¶
mfcsr = MF_csr_gpu(rating_matrix, K=50, alpha=0.001, beta=0.2, iterations=20, samples=samples)
training_process = mfcsr.train()
cuda
Number of training samples: 9988363
Iteration: 1 ; error = 1.0936
Iteration: 2 ; error = 1.0882
Iteration: 3 ; error = 1.0828
Iteration: 4 ; error = 1.0766
Iteration: 5 ; error = 1.0690
Iteration: 6 ; error = 1.0587
Iteration: 7 ; error = 1.0466
Iteration: 8 ; error = 1.0368
Iteration: 9 ; error = 1.0311
Iteration: 10 ; error = 1.0269
Iteration: 11 ; error = 1.0229
Iteration: 12 ; error = 1.0190
Iteration: 13 ; error = 1.0152
Iteration: 14 ; error = 1.0114
Iteration: 15 ; error = 1.0077
Iteration: 16 ; error = 1.0041
Iteration: 17 ; error = 1.0005
Iteration: 18 ; error = 0.9969
Iteration: 19 ; error = 0.9934
Iteration: 20 ; error = 0.9900
Scavenge for loot¶
# Create CPU copies while keeping original CUDA tensors unchanged
b_u_cpu = mfcsr.b_u.cpu().clone()
b_i_cpu = mfcsr.b_i.cpu().clone()
P_cpu = mfcsr.P.cpu().clone()
Q_cpu = mfcsr.Q.cpu().clone()
b_cpu = mfcsr.b.cpu().clone()
# Create a dictionary with all the parameters
# Commented out to avoid overwriting
# model_params = {
# 'b_u': b_u_cpu,
# 'b_i': b_i_cpu,
# 'P': P_cpu,
# 'Q': Q_cpu,
# 'b': b_cpu,
# 'training_process': training_process
# }
#
# # Save the parameters to a file
# with open('model_params.pkl', 'wb') as f:
# pickle.dump(model_params, f)
Reproducing the Intellectual achievements¶
with open('model_params.pkl', 'rb') as f:
loaded_params = pickle.load(f)
# Access the variables
b_u_cpu = loaded_params['b_u']
b_i_cpu = loaded_params['b_i']
P_cpu = loaded_params['P']
Q_cpu = loaded_params['Q']
b_cpu = loaded_params['b']
training_process = loaded_params['training_process']
The prophet appears¶
def predict_cpu(user_id: str, book_id: str):
"""
Predict rating for given user and item with numerical stability checks.
Parameters
----------
user_id : str
User ID
book_id : str
book ID
Returns
-------
float
Predicted rating clipped to valid range
"""
MIN_RATING = 1.0
MAX_RATING = 5.0
uid = user_to_idx.get(user_id, None)
iid = book_to_idx.get(book_id, None)
if uid is None or iid is None:
return -1
dot_product = np.dot(P_cpu[uid, :], Q_cpu[iid, :])
prediction = b_cpu + b_u_cpu[uid] + b_i_cpu[iid] + dot_product
# Clip prediction to valid rating range
return np.clip(prediction, MIN_RATING, MAX_RATING)
Chant a spell¶
predict_cpu('1939811104', 'AWH493Q118CB7')
tensor(4.4173, dtype=torch.float64)
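The last step of Task Seven asks for the RMSE on the held-out testset; a sketch, assuming test_samples is a list of (user_id, book_id, rating) tuples that were set aside before building rating_matrix (a hypothetical name, not defined above):
def test_rmse(test_samples):
    squared_errors = []
    for user_id, book_id, r in test_samples:
        pred = predict_cpu(user_id, book_id)
        if pred == -1:              # user or book unseen during training
            continue
        squared_errors.append((float(pred) - r) ** 2)
    return np.sqrt(np.mean(squared_errors))

# print(test_rmse(test_samples))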