# library() loads the packages used below
library(ggplot2) # for plotting
library(scales) # for plotting
library(stats) # for statistics
library(broom) # for statistics
library(gridExtra) # for plotting
library(stringr) # for string manipulation
library(animation) # for animation
library(fpc) # for clustering
# set random seed to ensure reproducibility
set.seed(142857)
# set global ggplot styles
dev.new(width = 2880, height = 1080, unit = "px") # set plot size
theme_set(theme_bw()) # set theme
theme_update(plot.title = element_text(hjust = 0.5)) # set title alignment
palette = c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC") # set color palette
palette_alt = character(length(palette)) # create a character vector to store the alternating palette
# build the alternating palette with a for loop: odd positions take colors from the front of the palette, even positions from the back (reversed)
for (i in 1:length(palette)) {
if (i %% 2 == 1) {
palette_alt[i] <- palette[(i + 1) %/% 2]
} else {
palette_alt[i] <- palette[length(palette) - (i - 1) %/% 2]
}
}
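# optional sketch: for this 8-color palette the same alternating order can be built without a
# loop by interleaving the first half with the reversed second half
# palette_alt = as.vector(rbind(palette[1:4], rev(palette[5:8])))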
gghistogram_style = function () {geom_histogram(bins = 60, alpha = 0.75)} # set histogram style
ggdensity_style = function () {geom_density(colour = "white", alpha = 0.75)} # set density style
ggscatter_style_def = function () {geom_point(size = 1.5, shape = 16, alpha = 0.75)} # set scatterplot style
ggscatter_style = function () {geom_point(colour = "#2166AC", size = 1.5, shape = 16, alpha = 0.75)} # set scatterplot style
gglabel_style = function () {geom_text(vjust = -0.5)} # set label style
ggqqplot_style = function () {geom_qq(colour = "#2166AC", size = 1.5, shape = 16, alpha = 0.75)} # set qqplot style
ggqqline_style = function () {geom_qq_line(linetype = "dashed", linewidth = 1.1, color = "#B2182B")} # set qqline style
gghline_style = function (x) {geom_hline(yintercept = x, linetype = "dashed", linewidth = 1.1, color = "#B2182B")} # set hline style
gginterval_confidence = function (model) { # set confidence interval style
geom_ribbon(aes(ymin = predict(model, interval = "confidence")[, 2],
ymax = predict(model, interval = "confidence")[, 3]),
color = "#B2182B", fill = "#B2182B", alpha = 0.2)
}
gginterval_prediction = function (model) { # set prediction interval style
geom_ribbon(aes(ymin = predict(model, interval = "prediction")[, 2],
ymax = predict(model, interval = "prediction")[, 3]),
color = "#FDDBC7", fill = "#FDDBC7", alpha = 0.2)
}
animate = function (dataframes, func) { # set animation function
saveGIF({
for (name in names(dataframes))
print(func(name, dataframes[[name]])) # call given func to plot
},
movie.name = paste0(deparse(substitute(func)), ".gif"), # name the gif after the plotting function
interval = 0.1, ani.width = 600, ani.height = 400) # set gif parameters
}
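# note (assumption about the environment): animation::saveGIF stitches the frames together with
# an external converter, typically ImageMagick (see ?saveGIF and ani.options("convert")), so that
# tool needs to be installed for the animate() calls further below to produce a gif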
all_path = "C:\\Users\\marcc\\My Drive\\Data Extraction\\all-crypto-daily" # set path
cat_path = "C:\\Users\\marcc\\My Drive\\Data Extraction\\categories-daily" # set path
all_file = list.files(all_path, pattern = "\\.csv$", full.names = TRUE) # list the daily all-crypto csv files (list.files patterns are regular expressions)
cat_file = list.files(cat_path, pattern = "artificial", full.names = TRUE) # list the artificial-intelligence category csv files
get_date = function (string) { # define function to extract date from file name
regex = "([0-9]{4}-[0-9]{2}-[0-9]{2})" # regex for the date embedded in each file name (e.g. 2018-01-01)
date = str_extract(string, regex) # extract date
return (date)
}
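# quick illustration with a hypothetical file name (the real files follow the same date pattern):
# get_date("all-crypto-daily-2018-01-01.csv") # returns "2018-01-01"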
read_df = function (file_list) {
data_dict = list() # create a dictionary to store dataframes
for (file in file_list) { # loop through files
date = get_date(file) # extract date
df = read.csv(file) # read csv (aka comma separated values) file
df$MarketCap = as.numeric(gsub(",", "", df$MarketCap)) # remove commas
df$Volume24h = as.numeric(gsub(",", "", df$Volume24h)) # remove commas
df = df[!is.na(df$MarketCap), ] # remove NA
df = df[!is.na(df$Volume24h), ] # remove NA
df = df[!df$MarketCap == 0, ] # remove 0
df = df[!df$Volume24h == 0, ] # remove 0
df[["Ratio"]] = df$Volume24h / df$MarketCap # calculate ratio
data_dict[[date]] = df # store dataframe in dictionary
}
return (data_dict)
}
all_df = read_df(all_file) # read files
cat_df = read_df(cat_file) # read files
df = all_df[[1]] # set df to be the first dataframe of all-crypto-daily
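# optional sanity check (not run): confirm the date keys and the expected columns were loaded
# head(names(all_df)); str(df[, c("Symbol", "Name", "MarketCap", "Volume24h", "Ratio")])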
join_df = function (all_df, cat_df) { # define function to join dataframes
df_joint = list() # create a dictionary to store dataframes
for (date in names(all_df)) { # loop through dates
if (date %in% names(cat_df)) { # check if date is in both dataframes
df_A = all_df[[date]] # get dataframe from all-crypto as A
df_B = cat_df[[date]] # get dataframe from categories as B
df_joint[[date]] = list(df_A, df_B) # store dataframes in dictionary
}
}
return (df_joint)
}
double_dataframes = join_df(all_df, cat_df) # use the function above to join the dataframes by date
label_df = function (df1, df2, cat) { # define function to label dataframes
for (i in 1: nrow(df1)) {
name = df1[i, "Symbol"] # get symbol from df1
if (name %in% df2$Symbol) {df1[i, "Category"] = cat} # check if symbol is in df2 and set it to cat
else {df1[i, "Category"] = "Others"} # if not, label as others
}
return (df1)
}
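# equivalent vectorised alternative (illustrative sketch, not used below): %in% labels every row
# in one step instead of looping over rows; label_df_vec is a hypothetical name
label_df_vec = function (df1, df2, cat) {
df1$Category = ifelse(df1$Symbol %in% df2$Symbol, cat, "Others") # vectorised label assignment
return (df1)
}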
dataframes = list() # create a dictionary to store dataframes
for (date in names(double_dataframes)) {
df1 = double_dataframes[[date]][[1]] # get dataframe from all-crypto as df1
df2 = double_dataframes[[date]][[2]] # get dataframe from categories as df2
df = label_df(df1, df2, "AI") # call above function to set label
df = df[order(df$Category, decreasing = TRUE), ] # sort df by category
dataframes[[date]] = df # store dataframe in dictionary
}
plot_Mvol = function (date, df) { # define function to plot histogram for mass distribution of trading volume
histogram =
ggplot(df, aes(x = Volume24h, fill = Category)) + scale_fill_manual(values = palette_alt) + gghistogram_style() +
scale_x_continuous(limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 400), name = "Token Counts") +
ggtitle(paste("Mass Distribution of Trading Volume", date))
return (histogram)
}
plot_Dvol = function (date, df) { # define function to plot histogram for density distribution of trading volume
histogram =
ggplot(df, aes(x = Volume24h, fill = Category)) + scale_fill_manual(values = palette_alt) + ggdensity_style() +
scale_x_continuous(limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 0.4), name = "Density") +
ggtitle(paste("Density Distribution of Trading Volume", date))
return (histogram)
}
plot_Mcap = function (date, df) { # define function to plot histogram for mass distribution of market capitalization
histogram =
ggplot(df, aes(x = MarketCap, fill = Category)) + scale_fill_manual(values = palette_alt) + gghistogram_style() +
scale_x_continuous(limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 400), name = "Token Counts") +
ggtitle(paste("Mass Distribution of Market Capitalization", date))
return (histogram)
}
plot_Dcap = function (date, df) { # define function to plot histogram for density distribution of market capitalization
histogram =
ggplot(df, aes(x = MarketCap, fill = Category)) + scale_fill_manual(values = palette_alt) + ggdensity_style() +
scale_x_continuous(limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)", trans = 'log10') +
scale_y_continuous(limits = c(0, 0.4), name = "Density") +
ggtitle(paste("Density Distribution of Market Capitalization", date))
return (histogram)
}
grid.arrange(plot_Mvol("", df), plot_Mcap("", df), plot_Dvol("", df), plot_Dcap("", df), nrow = 2) # plot histograms together
#animate(dataframes, plot_Mvol)
#animate(dataframes, plot_Dvol)
#animate(dataframes, plot_Mcap)
#animate(dataframes, plot_Dcap)
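# optional sketch using the scales package loaded above: format the log10 x axis as powers of
# ten; plot_Mvol_pow10 is an illustrative variant of plot_Mvol and is not used elsewhere
plot_Mvol_pow10 = function (date, df) { # volume histogram with 10^x axis labels
histogram =
ggplot(df, aes(x = Volume24h, fill = Category)) + scale_fill_manual(values = palette_alt) + gghistogram_style() +
scale_x_continuous(limits = c(1e-6, 1e11), name = "24h Trading Volume (USD)", trans = 'log10',
labels = trans_format("log10", math_format(10^.x))) +
scale_y_continuous(limits = c(0, 400), name = "Token Counts") +
ggtitle(paste("Mass Distribution of Trading Volume", date))
return (histogram)
}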
plot_scatter = function (date, df) { # define function to plot scatterplot
scatterplot =
ggplot(df, aes(x = Volume24h, y = MarketCap, color = Category)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() +
scale_x_continuous(trans = 'log10', limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(trans = 'log10', limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("Logarithmic Relationship Scatterplot (Unlabelled)", date))
return (scatterplot)
}
plot_scatter_label = function (date, df) { # define function to plot scatterplot with labels
scatterplot =
ggplot(df, aes(x = Volume24h, y = MarketCap, color = Category, label = Symbol)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() + gglabel_style() +
scale_x_continuous(trans = 'log10', limits = c(1e-6, 1e11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(trans = 'log10', limits = c(1, 1e13), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("Logarithmic Relationship Scatterplot (Labelled)", date))
return (scatterplot)
}
plot_scatter("", df) # call above function to plot
plot_scatter_label("", df) # call above function to plot
#animate(dataframes, plot_scatter)
#animate(dataframes, plot_scatter_label)
log_transform = function (df) { # define function to perform logarithmic transformation
logdf = df[, c("Symbol", "Name", "Category")] # create a new dataframe to store subset of df
logdf$Volume24h_log = log10(df$Volume24h) # perform log10 transformation
logdf$MarketCap_log = log10(df$MarketCap) # perform log10 transformation
logdf = logdf[complete.cases(logdf), ] # remove NA
return (logdf)
}
logdf = log_transform(df)
dbscan_clean = function (df, selection) {
dbscan_subset = df[, c("Volume24h_log", "MarketCap_log")] # create a new dataframe to store subset of df
dbscan_subset = dbscan_subset[complete.cases(dbscan_subset), ] # remove NA
dbscan_result = dbscan(dbscan_subset, eps = 0.75, MinPts = 10) # run DBSCAN (Density-Based Spatial Clustering of Applications with Noise) from fpc
df$Cluster = factor(dbscan_result$cluster) # add the cluster assignment to df (cluster 0 is noise)
df_clean = df[df$Cluster == 1, ] # keep only the main cluster
if (selection == "original") {return (df)} # return original df
else {return (df_clean)} # return cleaned df
}
logdf = dbscan_clean(logdf, "original") # keeps every row; only adds the Cluster column
logdf_clean = dbscan_clean(logdf, "cleaned") # perform DBSCAN to clean df
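# optional sketch for checking the eps = 0.75 choice: plot each point's distance to its
# MinPts-th nearest neighbour in ascending order; an "elbow" in the curve suggests a reasonable
# eps (knn_dist_check is an illustrative helper built on base dist(), fine for a few thousand rows)
knn_dist_check = function (df, k = 10) {
d = as.matrix(dist(df[, c("Volume24h_log", "MarketCap_log")])) # full pairwise Euclidean distance matrix
kth = apply(d, 1, function (row) sort(row)[k + 1]) # k-th nearest neighbour distance (position 1 is the point itself)
plot(sort(kth), type = "l", xlab = "Points sorted by k-NN distance", ylab = paste0(k, "-NN distance"))
}
# knn_dist_check(logdf, k = 10)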
dataframes_pre = list() # create a dictionary to store dataframes
dataframes_pos = list() # create a dictionary to store dataframes
for (name in names(dataframes)) {
df = log_transform(dataframes[[name]]) # perform log10 transformation
df1 = dbscan_clean(df, "original") # keeps every row; only adds the Cluster column
df2 = dbscan_clean(df, "cleaned") # perform DBSCAN to clean df
dataframes_pre[[name]] = df1 # store dataframe in dictionary
dataframes_pos[[name]] = df2 # store dataframe in dictionary
}
plot_cluster_pre = function (date, df) { # define function to plot clusterplot for DBSCAN result
clusterplot =
ggplot(df, aes(x = Volume24h_log, y = MarketCap_log, color = Cluster, label = Symbol)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 12), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("DBSCAN Clustering Result", date))
return (clusterplot)
}
plot_cluster_pos = function (date, df) { # define function to plot clusterplot for DBSCAN result
clusterplot =
ggplot(df, aes(x = Volume24h_log, y = MarketCap_log, color = Category, label = Symbol)) +
theme(legend.position = "none") + scale_color_manual(values = palette_alt) + ggscatter_style_def() +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 12), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("DBSCAN Clustering Result", date))
return (clusterplot)
}
grid.arrange(plot_cluster_pre("", logdf), plot_cluster_pos("", logdf_clean), nrow = 1) # plot clusterplots together
#animate(dataframes_pre, plot_cluster_pre)
#animate(dataframes_pos, plot_cluster_pos)
plot_qq = function (date, df) { # define function to plot qqplot
model = lm(MarketCap_log~Volume24h_log, data = df)
r = rstandard(model)
qqplot =
ggplot(df, aes(sample = r)) + ggqqplot_style() + ggqqline_style() +
scale_x_continuous(limits = c(-4, 4), name = "Theoretical Quantiles") +
scale_y_continuous(limits = c(-5, 5), name = "Sample Quantiles") +
ggtitle(paste("Quantile-Quantile Plot of Logarithmic Trading Volume", date))
return (qqplot)
}
plot_res = function (date, df) { # define function to plot the standardized residual plot
model = lm(MarketCap_log~Volume24h_log, data = df)
r = rstandard(model) # standardized residuals
residualplot =
ggplot(df, aes(x = Volume24h_log, y = r)) + ggscatter_style() + gghline_style(-2) + gghline_style(2) +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 5), name = "Standardized Residual (Market Capitalization)") +
ggtitle(paste("Standardized Residual Plot against Logarithmic Trading Volume", date))
return (residualplot)
}
grid.arrange(plot_qq("", logdf_clean), plot_res("", logdf_clean), nrow = 1) # plot qqplot and residual plot together
#animate(dataframes_pos, plot_qq)
#animate(dataframes_pos, plot_res)
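# optional numeric complement to the visual diagnostics (not run): a Shapiro-Wilk normality test
# on the standardized residuals (shapiro.test accepts between 3 and 5000 observations)
# shapiro.test(rstandard(lm(MarketCap_log~Volume24h_log, data = logdf_clean)))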
plot_CI = function (date, df) { # define function to plot scatterplot with confidence and prediction intervals
model = lm(MarketCap_log~Volume24h_log, data = df)
CIplot =
ggplot(df, aes(x = Volume24h_log, y = MarketCap_log)) +
ggscatter_style() + gginterval_confidence(model) + gginterval_prediction(model) +
scale_x_continuous(limits = c(-5, 11), name = "Logarithmic 24h Trading Volume (USD)") +
scale_y_continuous(limits = c(-5, 12), name = "Logarithmic Market Capitalization (USD)") +
ggtitle(paste("Scatterplot with Confidence Interval", date))
return (CIplot)
}
plot_CI("", logdf_clean) # call above function to plot
#animate(dataframes_pos, plot_CI)
model = lm(MarketCap_log~Volume24h_log, data = logdf_clean) # perform linear regression
anova(model) # perform ANOVA test
## Analysis of Variance Table
##
## Response: MarketCap_log
## Df Sum Sq Mean Sq F value Pr(>F)
## Volume24h_log 1 3091.2 3091.17 4612.4 < 2.2e-16 ***
## Residuals 3858 2585.6 0.67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qf(0.95, 1, nrow(logdf_clean) - 2) # F critical value at the 5% level with df1 = 1 and df2 = n - 2 (the residual degrees of freedom)
## [1] 3.84387
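# the broom package loaded above can summarise the same fitted model for reporting; a minimal
# sketch (not run):
# tidy(model)   # per-coefficient estimates, standard errors, t statistics and p-values
# glance(model) # overall fit statistics such as r.squared, sigma and the F statistic
# on the log10-log10 scale the fitted slope coef(model)[2] means market capitalization scales
# roughly as a power of trading volume: MarketCap ~ 10^intercept * Volume24h^slope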