This post analyses 2 years worth of Oyster card data totalling over 750 journeys across 69 tube stations. You can view your own journey history if you've registered your Oyster card with TfL.
## import the required packages
from __future__ import division
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = (10.0, 6.0)
import re
from shapely.geometry import Point
import geopandas as gpd
import folium
from folium import plugins
from textwrap import wrap
## you can read in your own data here
df = pd.read_excel('../../../Dropbox/tube.xlsx')
# we're only interested in tube journeys so let's ignore bus journeys and top-up information
ignore = ['Auto top-up', 'Topped up', 'Topped-up', 'Bus journey', 'Entered']
df = df[~df.journey.str.contains('|'.join(ignore))].reset_index(drop=True)
The raw data provided by TfL contains the date, start and end times of the tube journey as well as the fare and card balance at the time of the journey.
df.sample(10)
Let's apply some transformations to extract the relevant information from the raw data.
## Append dates and times and convert to timestamp objects
df['start_time'] = (df.date.astype(str) + ' ' + df.start.astype(str)).astype('datetime64[ns]')
df['end_time'] = (df.date.astype(str) + ' ' + df.end.astype(str)).astype('datetime64[ns]')
## Calculate duration of journey in minutes
df['duration'] = (df.end_time - df.start_time).apply(lambda x: pd.Timedelta(0) if x.days != 0 else x)
df['minutes'] = df.duration.apply(lambda x: x.components.minutes + 60*x.components.hours)
## Remove strings between brackets i.e. [],() in journey column
df['journey'] = df.journey.apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x))
df['start_station'] = df.journey.apply(lambda x: x.split(' to ')[0].strip())
df['end_station'] = df.journey.apply(lambda x: x.split(' to ')[-1].strip())
Now let's join the journey dataset with another dataset that has the geographic coordinates of every underground station obtained from here.
stations = pd.read_csv('../../../Dropbox/stations.csv')
## convert lat/lon coordinates to Shapely Point objects
stations['location'] = [Point(i) for i in zip(stations.lat, stations.lon)]
del stations['lat']
del stations['lon']
## convert station dataframe to a Geopandas dataframe
stations = gpd.GeoDataFrame(stations, geometry='location', crs={'init' :'epsg:4326'})
df = df.merge(stations, how='left', left_on='start_station', right_on='station')
df = df.merge(stations, how='left', left_on='end_station', right_on='station', suffixes=('_start', '_end'))
del df['station_start']
del df['station_end']
# folium doesn't render strings with apostrophes
stations.loc[:, 'station'] = stations.station.str.replace("'", '')
For each journey we now have the start/end times, the fare and the start/end station names and their coordinates.
df.sample(10)
print "Total number of journeys: {}".format(len(df))
print "Total number of unique stations visited: {}".format(np.unique(df[['start_station', 'end_station']].values).size)
print "Total number of unique journeys: {}".format(df.journey.nunique())
print "Total amount spent on tube journeys: £{}0".format(df.charge.sum())
print "Total time spent in a tube station: {} days".format(round(df.minutes.sum()/60/24, 2))
labels = df.start_station.value_counts()[:9].index
labels = ['\n'.join(wrap(l, 11)) for l in labels]
ax = df.start_station.value_counts()[:9].plot('bar', rot=0)
ax.set_xticklabels(labels)
plt.ylabel('frequency')
plt.tight_layout()
plt.title('Most popular departure stations');
labels = df.journey.value_counts()[:10].index
labels = ['\n'.join(wrap(l, 11)) for l in labels]
ax = df.journey.value_counts()[:10].plot('bar', rot=0)
ax.set_xticklabels(labels)
plt.tight_layout()
plt.ylabel('frequency')
plt.yticks(np.arange(0, 100, 10))
plt.title('Most frequent journeys');
df.date.dt.weekday_name.value_counts().plot('bar', rot=True)
plt.ylabel('frequency')
plt.title('Number of journeys by day of the week')
plt.tight_layout();
df.groupby(df.start_time.dt.hour).count()['start_time'].plot('bar', rot=True)
plt.ylabel('frequency')
plt.title('Number of journeys by hour of the day')
plt.tight_layout();
df.minutes.hist(bins=df.minutes.max() - df.minutes.min())
plt.xlim(0, 65)
plt.ylim(0, 50)
plt.xlabel('Journey length (mins)')
plt.ylabel('frequency')
plt.xticks(np.arange(0, 70, 5))
plt.title('Distribution of journey times')
plt.tight_layout();
## get list of all visited stations along with their coordinates
visited_stations = stations[stations.station.isin(np.unique(df[['start_station', 'end_station']].values))]
m = folium.Map(tiles='cartodbpositron', location=(51.51, -0.1), zoom_start=12)
for i in visited_stations.itertuples():
folium.Marker(location = i.location.coords[0],
popup = i.station,
icon = folium.Icon(color='red')).add_to(m)
# add a fullscreen button
plugins.Fullscreen(
position='topright',
title='Fullscreen',
title_cancel='Exit',
force_separate_button=True).add_to(m)
m
def plot_routes(n=10):
"""
Plot n most common routes.
"""
m = folium.Map(tiles='cartodbpositron', location=(51.51, -0.15), zoom_start=13)
routes = df.groupby(['start_station', 'end_station']).size().sort_values(ascending=False).head(n)
for i in routes.iteritems():
folium.PolyLine(
[stations[stations.station == i[0][0]].location.iloc[0].coords[0],
stations[stations.station == i[0][1]].location.iloc[0].coords[0]],
weight = np.log(i[1] + 0.5),
popup = '{} to {}, {} journeys'.format(i[0][0], i[0][1], i[1]),
opacity = 1,
color = 'black').add_to(m)
## plot all unique start and end stations
s = set([i[0][0] for i in routes.head(n).iteritems()] + [i[0][1] for i in routes.head(n).iteritems()])
for i in stations[stations.station.isin(s)].itertuples():
folium.Marker(location = i.location.coords[0],
popup = i.station,
icon = folium.Icon(color='blue')).add_to(m)
# add a fullscreen button
plugins.Fullscreen(
position='topright',
title='Fullscreen',
title_cancel='Exit',
force_separate_button=True).add_to(m)
return m
plot_routes()
def station_network(s = 'Earls Court'):
'''
Returns map of all destination stations departing from `s`.
Thickness of lines is a function of the frequency of journeys.
Average journey time obtained from clicking on relevant line.
'''
## Plot journey start station as red flag
start_station = stations[stations.station == s]
if len(start_station) == 0:
return 'No journeys from this station.'
m = folium.Map(tiles='cartodbpositron', location=start_station.location.iloc[0].coords[0], zoom_start=13)
folium.Marker(location = start_station.location.iloc[0].coords[0],
popup = start_station.station.iloc[0],
icon = folium.Icon(color='red')).add_to(m)
## Obtain all destinations from this start station with average journey time
end_stations = df[df.start_station == s].groupby('end_station').agg({'end_station': 'count',
'location_end': 'min',
'minutes': 'mean'})
for i in end_stations.itertuples():
folium.PolyLine(
[start_station.location.iloc[0].coords[0],
i.location_end.coords[0]],
weight = np.log(i.end_station + 0.5),
popup = '{} journeys, average time: {} mins'.format(i.end_station, round(i.minutes, 1)),
opacity = 1,
color = 'black').add_to(m)
folium.Marker(location = i.location_end.coords[0],
popup = i.Index,
icon = folium.Icon(color='blue')).add_to(m)
# add a fullscreen button
plugins.Fullscreen(
position='topright',
title='Fullscreen',
title_cancel='Exit',
force_separate_button=True).add_to(m)
return m
station_network()
m = folium.Map(tiles='cartodbpositron', location=(51.51, -0.1), zoom_start=12)
## convert times to unix timestamp objects
times = [int(i/1000000) for i in df.start_time.values.tolist()]
plugins.TimestampedGeoJson({
'type': 'FeatureCollection',
'features': [
{'type': 'Feature',
'geometry': {
'type': 'LineString',
'coordinates': df.location_start.apply(lambda x: x.coords[0][::-1]).values.tolist()[::-1]},
'properties': {'times': times[::-1]}}]},
transition_time=500,
period='P1W',
).add_to(m)
m