Pandas is extremely good at manipulating data. It can also compute new information on the fly. For a project, I had to manipulate timeseries data, to compute the probability that one variable be of values {-1, 0, 1, 2} given past values of the same half hour, every week.

# Import all libraries needed for the tutorial
import pandas as pd
import pandasql as ps
import numpy as np
import matplotlib.pyplot as plt
import sys # Only needed to determine Python version number
import matplotlib # Only needed to determine Matplotlib version number
from matplotlib.pylab import rcParams
# Enable inline plotting
% matplotlib inline
rcParams [ 'figure.figsize' ] = 10 , 10
print ( 'Python version ' + sys . version )
print ( 'Pandas version ' + pd . __version__ )
print ( 'Matplotlib version ' + matplotlib . __version__ )
dateparse = lambda x : pd . datetime . strptime ( x , ' % Y- % m- % d % H: % M: % S' )
def add_half_hour ( x ):
"""Add a new column with half hour indices given a Dataframe row."""
x [ 'half_hour' ] = ( x . index . hour * 60 + x . index . minute ) / 30
return x
def calc_probabilities ( s ):
"""Returns a probability given a Series object."""
temp = s . value_counts ()
return temp / temp . sum ()

%% time
# Read data and create index as timestamps on-the-fly
logging_data = pd . read_csv ( "./logging_data.csv" , parse_dates = [ 'server_timestamp' ],
index_col = 'server_timestamp' , date_parser = dateparse )

# Displays some information about the dataset
#logging_data.info()
#print logging_data.head()
#print '\nData Types:'
#print logging_data.dtypes
#logging_data.index

%% time
#.. take one charger
grouped = logging_data . groupby ( 'charger_id' )
group = list ( grouped )[ 2 ][ 1 ]
group . head () # Shows first few rows

%% time
#.. filter just one day
group [ 'day' ] = group [ 'cpc_timestamp' ] . apply ( lambda x : dateparse ( x ) . weekday ())
myday_group = group [ group [ 'day' ] == 2 ]
myday_group . head ()

%% time
#.. add half_hour indices
df2 = myday_group . groupby ( 'charger_id' ) . apply ( add_half_hour )

%% time
#.. calculate probabilities
res = df2 . groupby ( 'half_hour' )[ 'ev_connection_status' ] . apply ( calc_probabilities ) . sort_index ()

res

res . unstack ( level =- 1 ) . plot ( kind = 'bar' , stacked = True )