Abstract
On March 8, 2024, NVIDIA's stock price experienced a significant decline, falling from its all-time high (ATH) of 972.10 USD to 875.02 USD, a drop of nearly 10%. This decrease occurred simultaneously with a decline in the S&P 500 index, which fell from 5187.05 to 5096.96 points. This observation raises the question: how can we isolate the macroeconomic effect from the price movement of a specific asset?
In this study, we use the cryptocurrency market as an example and propose a simple approach to price analysis: comparing a single asset against overall market performance. By measuring where a coin's market capitalization falls relative to the whole market on a given date, we can track fluctuations in the coin's percentile over time and better understand the underlying mechanisms driving price movements. Combined with historical data and other factors such as market trends, this approach can help investors make informed investment decisions and potentially identify new opportunities for growth and profit.
0. Import libraries
%matplotlib inline
import os, re
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
USE_GPU = True
data_path = r'C:\Users\marcc\My Drive\Data Extraction\geckoscan-all'
1. Data preparation
Our dataset was obtained from CoinGecko using web scraping techniques; specifically, the selenium and requests libraries are used (reference). The dataset comprises multiple snapshots of the entire cryptocurrency market on various dates. As of the date of this analysis, CoinGecko lists approximately 10,000 cryptocurrencies, but around 6,000 of these are not actively traded on any exchange, centralized or decentralized. Consequently, the initial dataset contains "nan" values, which we will exclude from our analysis. Previous studies (reference) have demonstrated that both market capitalization and trading volume follow an exponential distribution. To normalize our data, we apply a logarithmic transformation in base 10, a choice that makes the axis labels easier for readers to interpret.
The blocks below clean and transform the historical crypto market data to normalize it and exclude irrelevant values. We define a function called clean_list that removes commas, filters out non-numbers, and converts items to floats, and a function called log_transform that performs a base-10 logarithmic transformation using torch on the GPU or NumPy on the CPU. The code then builds the cleaned dataset by iterating through all files in the data directory, extracting market capitalization and trading volume, applying clean_list and log_transform, and storing the results as tuples of date-sequence pairs for analysis. This normalization and transformation process ensures that the data is in a consistent format.
# Preview data
preview = os.listdir(data_path)[-1]
df = pd.read_csv(os.path.join(data_path, preview))
df.head()
Unnamed: 0 | id | Symbol | Name | image | Price | MarketCap | market_cap_rank | fully_diluted_valuation | Volume24h | ... | total_supply | max_supply | ath | ath_change_percentage | ath_date | atl | atl_change_percentage | atl_date | roi | last_updated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | bitcoin | btc | Bitcoin | https://assets.coingecko.com/coins/images/1/la... | 68355.000 | 1.343466e+12 | 1.0 | 1.435840e+12 | 6.167528e+10 | ... | 2.100000e+07 | 21000000.0 | 69428.00 | -1.42662 | 2024-03-08T20:05:03.481Z | 67.810000 | 1.008265e+05 | 2013-07-06T00:00:00.000Z | NaN | 2024-03-09T05:16:02.259Z |
1 | 1 | ethereum | eth | Ethereum | https://assets.coingecko.com/coins/images/279/... | 3927.860 | 4.719836e+11 | 2.0 | 4.719836e+11 | 2.642884e+10 | ... | 1.201052e+08 | NaN | 4878.26 | -19.36480 | 2021-11-10T14:24:19.604Z | 0.432979 | 9.083959e+05 | 2015-10-20T00:00:00.000Z | {'times': 75.81178792735467, 'currency': 'btc'... | 2024-03-09T05:16:02.231Z |
2 | 2 | tether | usdt | Tether | https://assets.coingecko.com/coins/images/325/... | 1.001 | 1.017375e+11 | 3.0 | 1.017375e+11 | 9.669063e+10 | ... | 1.015923e+11 | NaN | 1.32 | -24.24184 | 2018-07-24T00:00:00.000Z | 0.572521 | 7.507701e+01 | 2015-03-02T00:00:00.000Z | NaN | 2024-03-09T05:15:13.214Z |
3 | 3 | binancecoin | bnb | BNB | https://assets.coingecko.com/coins/images/825/... | 489.580 | 7.531749e+10 | 4.0 | 7.531749e+10 | 4.075490e+09 | ... | 1.538562e+08 | 200000000.0 | 686.31 | -28.64016 | 2021-05-10T07:24:17.097Z | 0.039818 | 1.229874e+06 | 2017-10-19T00:00:00.000Z | NaN | 2024-03-09T05:15:38.966Z |
4 | 4 | solana | sol | Solana | https://assets.coingecko.com/coins/images/4128... | 147.180 | 6.522912e+10 | 5.0 | 8.410659e+10 | 5.485457e+09 | ... | 5.713691e+08 | NaN | 259.96 | -43.37098 | 2021-11-06T21:54:35.825Z | 0.500801 | 2.929537e+04 | 2020-05-11T19:35:23.449Z | NaN | 2024-03-09T05:15:53.054Z |
5 rows × 27 columns
# Clean data
def clean_list(lst: list) -> list:
    lst = [str(item) for item in lst]              # normalize everything to strings
    lst = [item.replace(',', '') for item in lst]  # strip thousands separators
    lst = [item for item in lst if item != "-"]    # drop placeholder entries
    lst = [item for item in lst if item != "nan"]  # drop missing values
    lst = [float(item) for item in lst]
    lst = [item for item in lst if item != 0]      # zero values carry no information
    return lst
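As a quick sanity check, the cleaning steps can be exercised on a small hand-made list; a compact restatement of clean_list is included so the snippet runs on its own:

```python
def clean_list(lst: list) -> list:
    # Stringify, strip thousands separators, drop placeholders, cast, drop zeros
    lst = [str(item) for item in lst]
    lst = [item.replace(',', '') for item in lst]
    lst = [item for item in lst if item not in ("-", "nan")]
    lst = [float(item) for item in lst]
    return [item for item in lst if item != 0]

sample = ["1,343,466", "-", "nan", "0", 875.02]
print(clean_list(sample))  # [1343466.0, 875.02]
```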
# Logarithmic transformation (base 10)
def log_transform(lst: list) -> list:
    if USE_GPU:
        vector = torch.tensor(lst).cuda()
        log_vector = torch.log10(vector)
        lst = log_vector.tolist()
    else:
        lst = np.log10(lst).tolist()  # match the GPU branch: base 10, returned as a list
    return lst
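On the CPU path the transform is just a base-10 logarithm; a minimal check with exact powers of ten (independent of the torch path, which requires a CUDA device):

```python
import numpy as np

# Exact powers of ten map to whole numbers under log10
values = [10.0, 1_000.0, 1_000_000.0]
log_values = np.log10(values).tolist()
print(log_values)
```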
# Create cleaned dataset
mcap_sequence, tvol_sequence = [], []
for filename in os.listdir(data_path):
    date = re.search(r'\d{4}-\d{2}-\d{2}', filename).group()
    df = pd.read_csv(os.path.join(data_path, filename))
    mcap = df['MarketCap'].tolist()
    mcap = clean_list(mcap)
    mcap = log_transform(mcap)
    mcap_sequence.append((date, mcap))
    tvol = df['Volume24h'].tolist()
    tvol = clean_list(tvol)
    tvol = log_transform(tvol)
    tvol_sequence.append((date, tvol))
2. Distribution plot
We will quickly visualize the distribution of market capitalization and trading volume across different dates. We observe that the distribution of market capitalization is approximately normal on the log scale, with a mean of roughly 1,000,000 USD. The distribution of trading volume is harder to characterize: evidence shows a slight positive skew, with a distinct spike between 10,000 USD and 1,000,000 USD. While the reason for this spike is still unclear, one possible explanation is the presence of pump-and-dump schemes, which are prevalent (reference) in the cryptocurrency market.
To visualize the distributions of market capitalization (mcap) and trading volume (tvol), we created a function called plot_distribution. It takes a sequence of data along with the x-axis and y-axis limits, and plots a histogram for each date in the sequence using matplotlib. The function lays the histograms out on a grid, sets the axis limits and a per-date title, and turns off the axes of any empty subplots. This allows us to visualize the distribution of the data and identify any trends or patterns.
# Plot distribution
def plot_distribution(sequence, x: tuple, y: tuple):
    num_plots = len(sequence)
    num_cols = 10
    num_rows = (num_plots // num_cols) + (num_plots % num_cols > 0)
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(30, 30))
    for i, (date, lst) in enumerate(sequence):
        if i < num_plots:  # Check if there are still plots to be displayed
            row = i // num_cols
            col = i % num_cols
            axs[row, col].hist(lst, bins=30)
            axs[row, col].set_xlim(x)
            axs[row, col].set_ylim(y)
            axs[row, col].set_title(date)
    for i in range(num_plots, num_rows * num_cols):
        axs.flatten()[i].axis('off')
    plt.tight_layout()
    plt.show()
plot_distribution(mcap_sequence, (-2, 14), (0, 900))
plot_distribution(tvol_sequence, (-4, 12), (0, 1400))
3. Calculate statistics
We will now perform calculations on both market capitalization and trading volume for each day, including the calculation of means, standard deviations, and percentiles.
The code block below calculates statistical measures for a sequence of lists, where each list holds a single day's data. It uses two functions: calculate_stats, which takes a list of numbers and computes the mean, standard deviation, and percentiles; and calculate_stats_sequence, which applies calculate_stats to each list in a sequence and stores the results in a Pandas DataFrame. We call calculate_stats_sequence twice, once on the market capitalization sequence and once on the trading volume sequence, and assign the resulting DataFrames to the variables stats_mcap and stats_tvol, respectively.
# Calculate statistics for one day
def calculate_stats(lst: list) -> list:
    mean = np.mean(lst)
    std_dev = np.std(lst)
    quantiles = np.percentile(lst, [10, 25, 50, 75, 90])
    stats = [mean, std_dev] + quantiles.tolist()
    return stats

# Calculate statistics for each day and store in a Pandas DataFrame
def calculate_stats_sequence(sequence) -> pd.DataFrame:
    stats_list = []
    for date, lst in sequence:
        stats = calculate_stats(lst)
        stats_list.append([date] + stats)
    df = pd.DataFrame(stats_list, columns=['date', 'mean', 'std', '10th', '25th', 'median', '75th', '90th'])
    return df
stats_mcap = calculate_stats_sequence(mcap_sequence)
stats_tvol = calculate_stats_sequence(tvol_sequence)
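To confirm the statistics come out in the expected order, calculate_stats can be checked on the integers 1 to 100 (restated compactly here so the snippet is self-contained):

```python
import numpy as np

def calculate_stats(lst: list) -> list:
    mean = np.mean(lst)
    std_dev = np.std(lst)
    quantiles = np.percentile(lst, [10, 25, 50, 75, 90])
    return [mean, std_dev] + quantiles.tolist()

# Ordered output: [mean, std, 10th, 25th, median, 75th, 90th]
stats = calculate_stats(list(range(1, 101)))
print(stats)  # mean and median are both 50.5 for 1..100
```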
# Preview statistics for market cap
stats_mcap.head()
date | mean | std | 10th | 25th | median | 75th | 90th | |
---|---|---|---|---|---|---|---|---|
0 | 2023-08-09 | 6.159759 | 1.305085 | 4.675918 | 5.317879 | 6.124504 | 7.034170 | 7.783340 |
1 | 2023-08-10 | 6.158493 | 1.306935 | 4.665913 | 5.318132 | 6.124457 | 7.038537 | 7.777979 |
2 | 2023-08-15 | 6.156271 | 1.302976 | 4.670294 | 5.309894 | 6.120470 | 7.032239 | 7.776703 |
3 | 2023-08-20 | 6.131251 | 1.298832 | 4.656052 | 5.289159 | 6.098831 | 7.017901 | 7.732382 |
4 | 2023-08-20 | 6.132795 | 1.299603 | 4.656592 | 5.289621 | 6.103521 | 7.014202 | 7.733663 |
# Preview statistics for trading vol
stats_tvol.head()
date | mean | std | 10th | 25th | median | 75th | 90th | |
---|---|---|---|---|---|---|---|---|
0 | 2023-08-09 | 3.760884 | 1.909839 | 1.212187 | 2.528467 | 4.116629 | 5.041850 | 5.836456 |
1 | 2023-08-10 | 3.866598 | 1.893633 | 1.306425 | 2.715920 | 4.207285 | 5.113204 | 5.887423 |
2 | 2023-08-15 | 3.871511 | 1.901572 | 1.317395 | 2.725520 | 4.190132 | 5.109093 | 5.942390 |
3 | 2023-08-20 | 3.856813 | 1.834472 | 1.370883 | 2.720292 | 4.183303 | 5.066449 | 5.833962 |
4 | 2023-08-20 | 3.855298 | 1.850005 | 1.288607 | 2.741486 | 4.191136 | 5.096559 | 5.838132 |
4. Visualization
To visualize the progression of statistical values over time, we will create line graphs for each variable (i.e. market capitalization and trading volume).
Readers will observe a notable change in the market capitalization line graph starting from January 2024. This increase can be attributed to a combination of factors, including the introduction of spot Bitcoin ETFs and the approaching Bitcoin halving, which led to a sudden price increase across the entire cryptocurrency market. Furthermore, the standard deviation rises over this period, indicating that the bull market was accompanied by greater volatility.
In addition to the change observed in the market capitalization graph around January 2024, readers will also observe a notable change in the graph for trading volume around November 2023, which is attributable to a modification of data sources. Prior to November 2023, the data consisted of the first 100 pages of listings on CoinGecko, while post-November 2023, the data includes all listings on CoinGecko, resulting in an increase of approximately 2000 cryptocurrencies with relatively low reported trading volume. This shift is responsible for the observed decrease in means and percentiles, as well as the increase in standard deviation.
The code block below defines a function called plot_stats that takes a Pandas DataFrame and a title string as inputs. The function creates a subplot with three panels, each plotting a different statistical measure from the DataFrame: mean, standard deviation, and percentiles (10th, 25th, median, 75th, and 90th). The plots are labeled and titled accordingly, and the x-axis is set to display dates.
# Plot statistics for mean, std, and percentiles
def plot_stats(dataframe: pd.DataFrame, title: str):
    dataframe = dataframe.assign(date=pd.to_datetime(dataframe['date']))  # real dates so MonthLocator works
    fig, axs = plt.subplots(1, 3, figsize=(20, 5))  # set size
    # Mean over time
    axs[0].plot(dataframe['date'], dataframe['mean'], label='Mean')
    axs[0].set_xlabel('Date')
    axs[0].set_ylabel('Value')
    axs[0].set_title(f'Mean of {title} over Time')
    axs[0].xaxis.set_major_locator(mdates.MonthLocator())
    axs[0].grid(axis='y', linestyle='--')
    # SD over time
    axs[1].plot(dataframe['date'], dataframe['std'], label='Standard Deviation')
    axs[1].set_xlabel('Date')
    axs[1].set_ylabel('Value')
    axs[1].set_title(f'Standard Deviation of {title} over Time')
    axs[1].xaxis.set_major_locator(mdates.MonthLocator())
    axs[1].grid(axis='y', linestyle='--')
    # Quantiles over time
    axs[2].plot(dataframe['date'], dataframe['10th'], label='10th Percentile')
    axs[2].plot(dataframe['date'], dataframe['25th'], label='25th Percentile')
    axs[2].plot(dataframe['date'], dataframe['median'], label='Median')
    axs[2].plot(dataframe['date'], dataframe['75th'], label='75th Percentile')
    axs[2].plot(dataframe['date'], dataframe['90th'], label='90th Percentile')
    axs[2].set_xlabel('Date')
    axs[2].set_ylabel('Value')
    axs[2].set_title(f'Quantiles of {title} over Time')
    axs[2].legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
    axs[2].xaxis.set_major_locator(mdates.MonthLocator())
    axs[2].grid(axis='y', linestyle='--')
    plt.show()
plot_stats(stats_mcap, "Market Capitalization")
plot_stats(stats_tvol, "Trading Volume")
5. Proposition
Based on the observed statistical patterns (i.e., mean, standard deviation, and percentiles) in the line graphs for market capitalization and trading volume, we propose an alternative approach to visualizing price movement. Specifically, we will isolate macroeconomic effects from price movement by comparing the market capitalization of a specific coin on a given date relative to the overall market. This will allow us to capture the fluctuations in the percentile of this coin over time.
The code block below defines a function called valuation, which calculates the percentile of a given coin for each day and stores it in a Pandas DataFrame. The function iterates through the data files, reads each snapshot, and computes the percentile of the coin's market cap and 24-hour trading volume relative to the entire market, returning a DataFrame with the date, market cap percentile, and trading volume percentile. We then define a function called plot_result, which takes a Pandas DataFrame and a coin name as input and plots the progression of the coin's market cap and 24-hour trading volume percentiles over time.
# Percentile calculation function
def calculate_percentile(number: float, lst: list) -> float:
    # Percentile rank: share of values at or below `number`, as a percentage
    count = 0
    for i in lst:
        if i <= number:
            count += 1
    percentile = (count / len(lst)) * 100
    return percentile
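A self-contained check of the percentile rank with ten evenly spaced values (the function is restated in compact form so the snippet runs on its own):

```python
def calculate_percentile(number: float, lst: list) -> float:
    # Share of values less than or equal to `number`, as a percentage
    count = sum(1 for i in lst if i <= number)
    return (count / len(lst)) * 100

market = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
print(calculate_percentile(5.0, market))   # 50.0
print(calculate_percentile(10.0, market))  # 100.0
```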
# Calculate the percentile of a given coin on each day and store it in a Pandas DataFrame
def valuation(coin_of_interest: str):
    output = []
    for filename in os.listdir(data_path):
        date = re.search(r'\d{4}-\d{2}-\d{2}', filename).group()
        df = pd.read_csv(os.path.join(data_path, filename))
        row = df[df['Symbol'].str.lower() == coin_of_interest.lower()]
        coin_mcap = row['MarketCap'].to_list()
        coin_mcap = clean_list(coin_mcap)
        coin_mcap = log_transform(coin_mcap)
        if coin_mcap:
            mcap = df['MarketCap'].tolist()
            mcap = clean_list(mcap)
            mcap = log_transform(mcap)
            percentile_mcap = calculate_percentile(coin_mcap[0], mcap)
        else:
            percentile_mcap = 0  # coin absent from this snapshot
        coin_tvol = row['Volume24h'].to_list()
        coin_tvol = clean_list(coin_tvol)
        coin_tvol = log_transform(coin_tvol)
        if coin_tvol:
            tvol = df['Volume24h'].tolist()
            tvol = clean_list(tvol)
            tvol = log_transform(tvol)
            percentile_tvol = calculate_percentile(coin_tvol[0], tvol)
        else:
            percentile_tvol = 0
        output.append([date, percentile_mcap, percentile_tvol])
    output = pd.DataFrame(output, columns=['Date', 'MarketCap Percentile', 'Volume24h Percentile'])
    return output
# Plot result
def plot_result(df, name):
    df = df.assign(Date=pd.to_datetime(df['Date']))  # real dates so MonthLocator works
    plt.subplots(figsize=(10, 5))
    plt.plot(df['Date'], df['MarketCap Percentile'], label='MarketCap Percentile')
    plt.plot(df['Date'], df['Volume24h Percentile'], label='Volume24h Percentile')
    plt.xlabel('Date')
    plt.ylabel('Percentile')
    plt.title(f'Percentile Progression of {name} vs Date')
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
    plt.legend()
    plt.show()
6. Demonstration
To illustrate this approach, we will conduct a short demonstration using Bitcoin and a recently launched meme coin called PEPE, covering July 2023 to February 2024. Readers will observe that Bitcoin's percentile consistently remains at 100, in line with its position as the most capitalized cryptocurrency.
Please note that the demonstration provided is for illustrative purposes only. To gain meaningful insights into potential altcoins in the market and make informed investment decisions, it is essential to use historical data, along with other factors, such as market trends and analysis of the underlying technology.
# Demonstration for BTC
result = valuation("BTC")
plot_result(result, 'BTC')
# Demonstration for PEPE
result = valuation("PEPE")
plot_result(result, 'PEPE')