# library() loads the packages used below
library(ggplot2) # for plotting
library(scales) # for plotting
library(stats) # for statistics
library(broom) # for statistics
library(gridExtra) # for plotting
library(stringr) # for string manipulation
library(animation) # for animation
library(fpc) # for clustering
# set random seed to ensure reproducibility
set.seed(142857)
# set global ggplot styles
dev.new(width = 2880, height = 1080, unit = "px") # set plot size
theme_set(theme_bw()) # set theme
theme_update(plot.title = element_text(hjust = 0.5)) # set title alignment
palette = c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC") # set color palette
palette_alt = character(length(palette)) # create a character vector to store the alternating palette
# build the alternating palette with a for loop: odd positions take colors from the front of the palette, even positions from the back (reversed)
for (i in 1:length(palette)) {
if (i %% 2 == 1) {
palette_alt[i] <- palette[(i + 1) %/% 2]
} else {
palette_alt[i] <- palette[length(palette) - (i - 1) %/% 2]
}
}
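# optional sketch: for this 8-color palette the same alternating order can be built without a
# loop by interleaving the first half with the reversed second half
# palette_alt = as.vector(rbind(palette[1:4], rev(palette[5:8])))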
gghistogram_style = function () {geom_histogram(bins = 60, alpha = 0.75)} # set histogram style
ggdensity_style = function () {geom_density(colour = "white", alpha = 0.75)} # set density style
ggscatter_style_def = function () {geom_point(size = 1.5, shape = 16, alpha = 0.75)} # set scatterplot style
ggscatter_style = function () {geom_point(colour = "#2166AC", size = 1.5, shape = 16, alpha = 0.75)} # set scatterplot style
gglabel_style = function () {geom_text(vjust = -0.5)} # set label style
ggqqplot_style = function () {geom_qq(colour = "#2166AC", size = 1.5, shape = 16, alpha = 0.75)} # set qqplot style
ggqqline_style = function () {geom_qq_line(linetype = "dashed", linewidth = 1.1, color = "#B2182B")} # set qqline style
gghline_style = function (x) {geom_hline(yintercept = x, linetype = "dashed", linewidth = 1.1, color = "#B2182B")} # set hline style
gginterval_confidence = function (model) { # set confidence interval style
geom_ribbon(aes(ymin = predict(model, interval = "confidence")[, 2],
ymax = predict(model, interval = "confidence")[, 3]),
color = "#B2182B", fill = "#B2182B", alpha = 0.2)
}
gginterval_prediction = function (model) { # set prediction interval style
geom_ribbon(aes(ymin = predict(model, interval = "prediction")[, 2],
ymax = predict(model, interval = "prediction")[, 3]),
color = "#FDDBC7", fill = "#FDDBC7", alpha = 0.2)
}
animate = function (dataframes, func) { # set animation function
saveGIF({
for (name in names(dataframes))
print(func(name, dataframes[[name]])) # call given func to plot
},
movie.name = paste0(deparse(substitute(func)), ".gif"), # name the gif after the plotting function
interval = 0.1, ani.width = 600, ani.height = 400) # set gif parameters
}
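# note (assumption about the environment): animation::saveGIF stitches the frames together with
# an external converter, typically ImageMagick (see ?saveGIF and ani.options("convert")), so that
# tool needs to be installed for the animate() calls further below to produce a gif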
all_path = "C:\\Users\\marcc\\My Drive\\Data Extraction\\all-crypto-daily" # set path
cat_path = "C:\\Users\\marcc\\My Drive\\Data Extraction\\categories-daily" # set path
all_file = list.files(all_path, pattern = "\\.csv$", full.names = TRUE) # list the daily all-crypto csv files (list.files patterns are regular expressions)
cat_file = list.files(cat_path, pattern = "artificial", full.names = TRUE) # list the artificial-intelligence category csv files
get_date = function (string) { # define function to extract date from file name
regex = "([0-9]{4}-[0-9]{2}-[0-9]{2})" # regex for the date embedded in each file name (e.g. 2018-01-01)
date = str_extract(string, regex) # extract date
return (date)
}
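# quick illustration with a hypothetical file name (the real files follow the same date pattern):
# get_date("all-crypto-daily-2018-01-01.csv") # returns "2018-01-01"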
read_df = function (file_list) {
data_dict = list() # create a dictionary to store dataframes
for (file in file_list) { # loop through files
date = get_date(file) # extract date
df = read.csv(file) # read csv (aka comma separated values) file
df$MarketCap = as.numeric(gsub(",", "", df$MarketCap)) # remove commas
df$Volume24h = as.numeric(gsub(",", "", df$Volume24h)) # remove commas
df = df[!is.na(df$MarketCap), ] # remove NA
df = df[!is.na(df$Volume24h), ] # remove NA
df = df[!df$MarketCap == 0, ] # remove 0
df = df[!df$Volume24h == 0, ] # remove 0
df[["Ratio"]] = df$Volume24h / df$MarketCap # calculate ratio
data_dict[[date]] = df # store dataframe in dictionary
}
return (data_dict)
}
all_df = read_df(all_file) # read files
cat_df = read_df(cat_file) # read files
df = all_df[[1]] # set df to be the first dataframe of all-crypto-daily
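# optional sanity check (not run): confirm the date keys and the expected columns were loaded
# head(names(all_df)); str(df[, c("Symbol", "Name", "MarketCap", "Volume24h", "Ratio")])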
join_df = function (all_df, cat_df) { # define function to join dataframes
df_joint = list() # create a dictionary to store dataframes
for (date in names(all_df)) { # loop through dates
if (date %in% names(cat_df)) { # check if date is in both dataframes
df_A = all_df[[date]] # get dataframe from all-crypto as A
df_B = cat_df[[date]] # get dataframe from categories as B
df_joint[[date]] = list(df_A, df_B) # store dataframes in dictionary
}
}
return (df_joint)
}
double_dataframes = join_df(all_df, cat_df) # use the function above to join the dataframes by date
label_df = function (df1, df2, cat) { # define function to label dataframes
for (i in 1: nrow(df1)) {
name = df1[i, "Symbol"] # get symbol from df1
if (name %in% df2$Symbol) {df1[i, "Category"] = cat} # check if symbol is in df2 and set it to cat
else {df1[i, "Category"] = "Others"} # if not, label as others
}
return (df1)
}
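# equivalent vectorised alternative (illustrative sketch, not used below): %in% labels every row
# in one step instead of looping over rows; label_df_vec is a hypothetical name
label_df_vec = function (df1, df2, cat) {
df1$Category = ifelse(df1$Symbol %in% df2$Symbol, cat, "Others") # vectorised label assignment
return (df1)
}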
dataframes = list() # create a dictionary to store dataframes
for (date in names(double_dataframes)) {
df1 = double_dataframes[[date]][[1]] # get dataframe from all-crypto as df1
df2 = double_dataframes[[date]][[2]] # get dataframe from categories as df2
df = label_df(df1, df2, "AI") # call above function to set label
df = df[order(df$Category, decreasing = TRUE), ] # sort df by category
dataframes[[date]] = df # store dataframe in dictionary
}
plot_Mvol = function (date, df) { # define function to plot histogram for mass distribution of trading volume
histogram =
ggplot(df, aes(x = Volume24h, fill = Category)) + scale_fill_manual(values = palette_alt) + gghistogram_style() +
scale_x_continuous(limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 400), name = "Token Counts") +
ggtitle(paste("Mass Distribution of Trading Volume", date))
return (histogram)
}
plot_Dvol = function (date, df) { # define function to plot histogram for density distribution of trading volume
histogram =
ggplot(df, aes(x = Volume24h, fill = Category)) + scale_fill_manual(values = palette_alt) + ggdensity_style() +
scale_x_continuous(limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 0.4), name = "Density") +
ggtitle(paste("Density Distribution of Trading Volume", date))
return (histogram)
}
plot_Mcap = function (date, df) { # define function to plot histogram for mass distribution of market capitalization
histogram =
ggplot(df, aes(x = MarketCap, fill = Category)) + scale_fill_manual(values = palette_alt) + gghistogram_style() +
scale_x_continuous(limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 400), name = "Token Counts") +
ggtitle(paste("Mass Distribution of Market Capitalization", date))
return (histogram)
}
plot_Dcap = function (date, df) { # define function to plot histogram for density distribution of market capitalization
histogram =
ggplot(df, aes(x = MarketCap, fill = Category)) + scale_fill_manual(values = palette_alt) + ggdensity_style() +
scale_x_continuous(limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 0.4), name = "Density") +
ggtitle(paste("Density Distribution of Market Capitalization", date))
return (histogram)
}
grid.arrange(plot_Mvol("", df), plot_Mcap("", df), plot_Dvol("", df), plot_Dcap("", df), nrow = 2) # plot histograms together
#animate(dataframes, plot_Mvol)
#animate(dataframes, plot_Dvol)
#animate(dataframes, plot_Mcap)
#animate(dataframes, plot_Dcap)
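# optional sketch using the scales package loaded above: format the log10 x axis as powers of
# ten; plot_Mvol_pow10 is an illustrative variant of plot_Mvol and is not used elsewhere
plot_Mvol_pow10 = function (date, df) { # volume histogram with 10^x axis labels
histogram =
ggplot(df, aes(x = Volume24h, fill = Category)) + scale_fill_manual(values = palette_alt) + gghistogram_style() +
scale_x_continuous(limits = c(1e-6, 1e11), name = "24h Trading Volume (USD)", trans = 'log10',
labels = trans_format("log10", math_format(10^.x))) +
scale_y_continuous(limits = c(0, 400), name = "Token Counts") +
ggtitle(paste("Mass Distribution of Trading Volume", date))
return (histogram)
}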
plot_scatter = function (date, df) { # define function to plot scatterplot
scatterplot =
ggplot(df, aes(x = Volume24h, y = MarketCap, color = Category)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() +
scale_x_continuous(trans = 'log10', limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(trans = 'log10', limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("Logarithmic Relationship Scatterplot (Unlabelled)", date))
return (scatterplot)
}
plot_scatter_label = function (date, df) { # define function to plot scatterplot with labels
scatterplot =
ggplot(df, aes(x = Volume24h, y = MarketCap, color = Category, label = Symbol)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() + gglabel_style() +
scale_x_continuous(trans = 'log10', limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(trans = 'log10', limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("Logarithmic Relationship Scatterplot (Labelled)", date))
return (scatterplot)
}
plot_scatter("", df) # call above function to plot
plot_scatter_label("", df) # call above function to plot
#animate(dataframes, plot_scatter)
#animate(dataframes, plot_scatter_label)
log_transform = function (df) { # define function to perform logarithmic transformation
logdf = df[, c("Symbol", "Name", "Category")] # create a new dataframe to store subset of df
logdf$Volume24h_log = log10(df$Volume24h) # perform log10 transformation
logdf$MarketCap_log = log10(df$MarketCap) # perform log10 transformation
logdf = logdf[complete.cases(logdf), ] # remove NA
return (logdf)
}
logdf = log_transform(df)
dbscan_clean = function (df, selection) {
dbscan_subset = df[, c("Volume24h_log", "MarketCap_log")] # create a new dataframe to store subset of df
dbscan_subset = dbscan_subset[complete.cases(dbscan_subset), ] # remove NA
dbscan_result = dbscan(dbscan_subset, eps = 0.75, MinPts = 10) # run DBSCAN (Density-Based Spatial Clustering of Applications with Noise) from fpc
df$Cluster = factor(dbscan_result$cluster) # add the cluster assignment to df (cluster 0 is noise)
df_clean = df[df$Cluster == 1, ] # keep only the main cluster
if (selection == "original") {return (df)} # return original df
else {return (df_clean)} # return cleaned df
}
logdf = dbscan_clean(logdf, "original") # keeps every row; only adds the Cluster column
logdf_clean = dbscan_clean(logdf, "cleaned") # perform DBSCAN to clean df
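# optional sketch for checking the eps = 0.75 choice: plot each point's distance to its
# MinPts-th nearest neighbour in ascending order; an "elbow" in the curve suggests a reasonable
# eps (knn_dist_check is an illustrative helper built on base dist(), fine for a few thousand rows)
knn_dist_check = function (df, k = 10) {
d = as.matrix(dist(df[, c("Volume24h_log", "MarketCap_log")])) # full pairwise Euclidean distance matrix
kth = apply(d, 1, function (row) sort(row)[k + 1]) # k-th nearest neighbour distance (position 1 is the point itself)
plot(sort(kth), type = "l", xlab = "Points sorted by k-NN distance", ylab = paste0(k, "-NN distance"))
}
# knn_dist_check(logdf, k = 10)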
dataframes_pre = list() # create a dictionary to store dataframes
dataframes_pos = list() # create a dictionary to store dataframes
for (name in names(dataframes)) {
df = log_transform(dataframes[[name]]) # perform log10 transformation
df1 = dbscan_clean(df, "original") # keeps every row; only adds the Cluster column
df2 = dbscan_clean(df, "cleaned") # perform DBSCAN to clean df
dataframes_pre[[name]] = df1 # store dataframe in dictionary
dataframes_pos[[name]] = df2 # store dataframe in dictionary
}
plot_cluster_pre = function (date, df) { # define function to plot clusterplot for DBSCAN result
clusterplot =
ggplot(df, aes(x = Volume24h_log, y = MarketCap_log, color = Cluster, label = Symbol)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 12), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("DBSCAN Clustering Result", date))
return (clusterplot)
}
plot_cluster_pos = function (date, df) { # define function to plot clusterplot for DBSCAN result
clusterplot =
ggplot(df, aes(x = Volume24h_log, y = MarketCap_log, color = Category, label = Symbol)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 12), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("DBSCAN Clustering Result", date))
return (clusterplot)
}
grid.arrange(plot_cluster_pre("", logdf), plot_cluster_pos("", logdf_clean), nrow = 1) # plot clusterplots together
#animate(dataframes_pre, plot_cluster_pre)
#animate(dataframes_pos, plot_cluster_pos)
plot_qq = function (date, df) { # define function to plot qqplot
model = lm(MarketCap_log~Volume24h_log, data = df)
r = rstandard(model)
qqplot =
ggplot(df, aes(sample = r)) + ggqqplot_style() + ggqqline_style() +
scale_x_continuous(limits = c(-4, 4), name = "Theoretical Quantiles") +
scale_y_continuous(limits = c(-5, 5), name = "Sample Quantiles") +
ggtitle(paste("Quantile-Quantile Plot of Logarithmic Trading Volume", date))
return (qqplot)
}
plot_res = function (date, df) { # define function to plot the standardized residual plot
model = lm(MarketCap_log~Volume24h_log, data = df)
r = rstandard(model) # standardized residuals
residualplot =
ggplot(df, aes(x = Volume24h_log, y = r)) + ggscatter_style() + gghline_style(-2) + gghline_style(2) +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 5), name = "Standardized Residual (Market Capitalization)") +
ggtitle(paste("Standardized Residual Plot against Logarithmic Trading Volume", date))
return (residualplot)
}
grid.arrange(plot_qq("", logdf_clean), plot_res("", logdf_clean), nrow = 1) # plot qqplot and residual plot together
#animate(dataframes_pos, plot_qq)
#animate(dataframes_pos, plot_res)
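# optional numeric complement to the visual diagnostics (not run): a Shapiro-Wilk normality test
# on the standardized residuals (shapiro.test accepts between 3 and 5000 observations)
# shapiro.test(rstandard(lm(MarketCap_log~Volume24h_log, data = logdf_clean)))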
plot_CI = function (date, df) { # define function to plot scatterplot with confidence and prediction intervals
model = lm(MarketCap_log~Volume24h_log, data = df)
CIplot =
ggplot(df, aes(x = Volume24h_log, y = MarketCap_log)) +
ggscatter_style() + gginterval_confidence(model) + gginterval_prediction(model) +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 12), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("Scatterplot with Confidence Interval", date))
return (CIplot)
}
plot_CI("", logdf_clean) # call above function to plot
#animate(dataframes_pos, plot_CI)
model = lm(MarketCap_log~Volume24h_log, data = logdf_clean) # perform linear regression
anova(model) # perform ANOVA test
## Analysis of Variance Table
##
## Response: MarketCap_log
## Df Sum Sq Mean Sq F value Pr(>F)
## Volume24h_log 1 3091.2 3091.17 4612.4 < 2.2e-16 ***
## Residuals 3858 2585.6 0.67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qf(0.95, 1, nrow(logdf_clean) - 2) # F critical value at the 5% level with df1 = 1 and df2 = n - 2 (the residual degrees of freedom)
## [1] 3.84387
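# the broom package loaded above can summarise the same fitted model for reporting; a minimal
# sketch (not run):
# tidy(model)   # per-coefficient estimates, standard errors, t statistics and p-values
# glance(model) # overall fit statistics such as r.squared, sigma and the F statistic
# on the log10-log10 scale the fitted slope coef(model)[2] means market capitalization scales
# roughly as a power of trading volume: MarketCap ~ 10^intercept * Volume24h^slope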