import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import calendar
from matplotlib.lines import Line2D

Strava is an activity tracking application that allows users to see their distance and route, and a whole host of other interesting activity data. Since I have been using this application for over 7 years now, I have built up a large cache of personal data in the app. To see how my activity in general and my activity of choice in particular have changed over the years, I created the following data visualizations.

View this data visualization as a web app

Import and view data

df = pd.read_csv ('~/Desktop/Code/StravaData/activities.csv')
df.head()
Activity ID Activity Date Activity Name Activity Type Activity Description Elapsed Time Distance Relative Effort Commute Activity Gear ... Cloud Cover Weather Visibility UV Index Weather Ozone <span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.jump_count">Jump Count</span> <span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.total_grit">Total Grit</span> <span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_flow">Avg Flow</span> <span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.flagged">Flagged</span> <span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_elapsed_speed">Avg Elapsed Speed</span> <span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.dirt_distance">Dirt Distance</span>
0 159187244 Jun 28, 2014, 4:48:31 PM Horsetooth Ride NaN 4641 14.67 NaN False NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 159314142 Jun 28, 2014, 10:41:53 PM Antelope trail Ride NaN 5455 8.79 NaN False NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 160874659 Jul 2, 2014, 4:07:30 PM North Fruita Desert Ride NaN 7846 21.89 NaN False NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 160971473 Jul 2, 2014, 8:00:21 PM Horsethief loop Ride NaN 5257 13.34 NaN False NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 161316446 Jul 3, 2014, 3:48:59 PM The Whole Enchilada (before phone died) Ride NaN 10706 16.55 NaN False NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 80 columns

Format activity date column as datetime object

df['Activity Date'] = list(map(lambda x: datetime.strptime(x,'%b %d, %Y, %H:%M:%S %p'),df['Activity Date']))
df['Year'] = list(map(lambda x: x.year, df['Activity Date']))
df['Weekday'] = list(map(lambda x: x.weekday(), df['Activity Date'])) # weekdays labeled 0-6 meaning Monday-Sunday
daylist = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
df['weekday'] = list(map(lambda x: daylist[x], df['Weekday'])) # weekdays labeled Monday-Sunday
df['Distance mi'] = list(map(lambda x: x*0.621371, df['Distance']))
years = np.unique(np.array(df['Year']))
array = np.zeros((years.size,7,53))

Create activity arrays for visualization

yearcount = 0
for i in range(1,df['Activity Date'].size):
    lastdate = df['Activity Date'][i-1]
    date = df['Activity Date'][i]
    x = (date.weekday(),date.week,date.year)
    if date.year != lastdate.year:
        yearcount+=1
    if df['Activity Type'][i] == 'Run':
        array[yearcount,date.weekday(),date.week%53] = 1
    if df['Activity Type'][i] == 'Ride':
        array[yearcount,date.weekday(),date.week%53] = 2
    if df['Activity Type'][i] == 'Nordic Ski':
        array[yearcount,date.weekday(),date.week%53] = 3
    if df['Activity Type'][i] == 'Hike':
        array[yearcount,date.weekday(),date.week%53] = 4
    if df['Activity Type'][i] == 'Alpine Ski':
        array[yearcount,date.weekday(),date.week%53] = 5
    if df['Activity Type'][i] == 'Canoe':
        array[yearcount,date.weekday(),date.week%53] = 6

Define visualization colors

none = 'black'
run = 'maroon'
ride = 'darkorange'
nordicski = 'darkcyan'
hike = 'yellow'
alpineski = 'azure'
canoe = 'midnightblue'

Create visualization

plt.style.use('fivethirtyeight')
fig, axs = plt.subplots(8,figsize=(24,28))
plt.subplots_adjust(hspace=0.1)
for j in range(0,years.size):
    if j == years.size-1:
        axs[j].set_xlabel('Week of the Year',fontsize=26, color = 'w')
    if j == 0:
        axs[j].set_title('Strava Activities',fontsize=26, color = 'w')
    axs[j].set_facecolor('black')
    axs[j].grid(False)
    axs[j].set_ylabel(f"{years[j]}",fontsize=26, color = 'w')
    fig.patch.set_facecolor('black')
    activities = array[j]
    X,Y = np.meshgrid(np.arange(activities.shape[1]), np.arange(activities.shape[0]))
    colors = {0.0:none,1.0:run, 2.0:ride, 3.0:nordicski,
              4.0:hike, 5.0:alpineski, 6.0:canoe}
    axs[j].scatter(X.flatten(), abs(Y.flatten()-6), c=pd.Series(activities.flatten()).map(colors), s = 500)
    axs[j].set_xlim(-1,53)
    axs[j].set_ylim(-1,7)
    axs[j].set_yticks(ticks = [6,5,4,3,2,1,0])
    axs[j].set_yticklabels(['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],
                           fontsize=20, color = 'w')
    axs[j].spines['top'].set_visible(False)
    axs[j].spines['right'].set_visible(False)
    axs[j].spines['bottom'].set_visible(False)
    axs[j].spines['left'].set_visible(False)
    axs[j].set_xticks(ticks = np.linspace(0,52,27))
    axs[j].set_xticklabels(labels = (np.linspace(0,52,27,dtype=int)),fontsize=16, color = 'w')

custom_markers = [Line2D([0], [0], marker = "o", ms=22 , color=run, lw=0),
                Line2D([0], [0], marker = "o", ms=22 , color=ride, lw=0),
                Line2D([0], [0], marker = "o", ms=22 , color=hike, lw=0),
                Line2D([0], [0], marker = "o", ms=22 , color=alpineski, lw=0),
                Line2D([0], [0], marker = "o", ms=22 , color=nordicski, lw=0),
                Line2D([0], [0], marker = "o", ms=22 , color=canoe, lw=0)]
plt.legend(custom_markers, ['Run', 'Ride', 'Hike','Alpine Ski','Nordic Ski','Canoe'],
                      loc=(0.03,7.82),fontsize=20,labelcolor='w',facecolor='black')
plt.show()

png

Additional visualizations

def group_and_count(colname):
    count_df = df.groupby([colname]).count()['Activity ID']
    count_df = count_df.sort_values(ascending=False)
    x_labs = np.array(count_df.index)
    y_vals = np.array(count_df)
    return x_labs,y_vals

def charter(xlabs,y,title):
    plt.figure(figsize=(10,6))
    xpoints = np.linspace(start = 0, stop = len(y)-1,num = len(y))
    plt.bar(x = xpoints, height = y, width = 0.9, tick_label = xlabs, color = 'teal')
    plt.title(title)
    plt.xticks(fontsize=16)
    plt.grid(None,axis='x')
    for count, yval in enumerate(y):
        plt.annotate(text = f"{yval}", xy = (count-0.15,yval+5),fontsize=14)
    plt.show()
results = group_and_count('Activity Type')
charter(results[0],results[1],'Activity Count by Type (2014-2021)')

png

results = group_and_count('Year')
charter(results[0],results[1],'Activity Count by Year (2014-2021)')

png

results = group_and_count('weekday')
charter(results[0],results[1],'Activity Count by Weekday (2014-2021)')

png

avg_dist = df.groupby(['Activity Type']).mean()['Distance mi'].sort_values(ascending=False)

activity_count_dict = df.groupby(['Activity Type']).count()['Activity ID'].to_dict() # dict for annotations

y = avg_dist
xlabs = avg_dist.index
plt.figure(figsize=(10,6))
xpoints = np.linspace(start = 0, stop = len(y)-1,num = len(y))
plt.bar(x = xpoints, height = y, width = 0.9, tick_label = xlabs, color = 'teal')
plt.title("Average Distance by Activity Type (2014-2021)")
plt.ylabel("Distance (mi)")
plt.xticks(fontsize=16)
plt.grid(None,axis='x')
for count, yval in enumerate(y):
    plt.annotate(text = f"{yval:.2f}", xy = (count-0.15,yval+0.5),fontsize=14)
    plt.annotate(text = f"n = {activity_count_dict[xlabs[count]]}", xy = (count-0.3,yval-1),fontsize=14)
plt.show()

png

def specific_df(activity):
    specific_df = df.where(df['Activity Type']==activity,np.nan)
    return specific_df.dropna(axis=0,how='all').reset_index()
ride_df = specific_df('Ride')
run_df = specific_df('Run')
ski_df = specific_df('Nordic Ski')
hike_df = specific_df('Hike')
plt.figure(figsize=(10,6))
plt.boxplot([ride_df['Distance mi'],run_df['Distance mi'],ski_df['Distance mi'],hike_df['Distance mi']],
            labels = ['Ride','Run','Nordic Ski','Hike'])
plt.ylabel("Distance (mi)")
plt.title("Average Distance by Activity Type (2014-2021)")
plt.show()

png

plt.figure(figsize=(10,8))
plt.hist([ride_df['Distance mi'],run_df['Distance mi'],ski_df['Distance mi'],hike_df['Distance mi']],
         histtype = 'step', alpha = 1.0, range = (0,60), bins=60,
         label = ["Ride","Run",'Nordic Ski','Hike'],color = ['blue','red','teal','orange'],lw=1.0)
plt.xlim(-1,30)
plt.title("Distance Distribution by Activity Type (2014-2021)")
plt.xlabel("Distance (mi)")
plt.ylabel("Occurrences")
plt.legend()
plt.show()

png

plt.figure(figsize=(10,8))
plt.hist([ride_df['Distance mi'],run_df['Distance mi'],ski_df['Distance mi'],hike_df['Distance mi']],
         histtype = 'step', alpha = 1.0, range = (0,60), bins=120, cumulative = True, density = True,
         label = ["Ride","Run",'Nordic Ski','Hike'],color = ['blue','red','teal','orange'],lw=1.0)
plt.xlim(-1,20)
plt.title("Empirical Cumulative Distribution Function\nby Activity Type (2014-2021)")
plt.xlabel("Distance (mi)")
plt.ylabel("Probability")
plt.legend(loc='lower right')
plt.show()

png

Just for interest, I’ll create a simple model to calculate the probability that any of my rides is greater than a certain distance.

from statsmodels.distributions.empirical_distribution import ECDF
ecdf = ECDF(ride_df['Distance mi']) # create model with ride data
print(f"*Based on my personal riding history*")
print(f"Distance: 5 miles  \t Probability of Riding Farther: {1-ecdf(5):.4f}")
print(f"Distance: 10 miles \t Probability of Riding Farther: {1-ecdf(10):.4f}")
print(f"Distance: 15 miles \t Probability of Riding Farther: {1-ecdf(15):.4f}")
print(f"Distance: 20 miles \t Probability of Riding Farther: {1-ecdf(20):.4f}")
print(f"Distance: 30 miles \t Probability of Riding Farther: {1-ecdf(30):.4f}")
*Based on my personal riding history*
Distance: 5 miles  	 Probability of Riding Farther: 0.8468
Distance: 10 miles 	 Probability of Riding Farther: 0.4393
Distance: 15 miles 	 Probability of Riding Farther: 0.1908
Distance: 20 miles 	 Probability of Riding Farther: 0.0838
Distance: 30 miles 	 Probability of Riding Farther: 0.0260